
Commit d0c3a7b

aivanou authored and facebook-github-bot committed
Add automatic setting of CUDA_VISIBLE_DEVICES for local scheduler (#383)
Summary: Pull Request resolved: #383

This diff adds automatic setting of `CUDA_VISIBLE_DEVICES` based on `num_replicas`; each replica gets the same number of devices. The algorithm applies only when `CUDA_VISIBLE_DEVICES` is not already set, and it uses `nvidia-smi` to determine the number of available GPUs.

Refs #297, #377

Differential Revision: D34064433

fbshipit-source-id: 8436d962d4a3444608b4f86eb507598487c2cc5b
1 parent 8b62ea8 commit d0c3a7b
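
The assignment scheme is simple: GPUs requested by earlier roles occupy the lower device indices, and within a role each replica gets a contiguous block of `resource.gpu` devices. The sketch below illustrates that arithmetic in isolation; `assign_visible_devices` is a hypothetical helper written for this page, not part of the diff, but the printed values match the unit tests added in this commit.

# Illustrative sketch of the CUDA_VISIBLE_DEVICES assignment scheme described above.
# `assign_visible_devices` is a hypothetical helper, not part of this commit.
from typing import Dict, List


def assign_visible_devices(roles: List[Dict]) -> Dict[str, List[str]]:
    # roles: [{"name": ..., "gpu": gpus_per_replica, "num_replicas": ...}, ...]
    assignments: Dict[str, List[str]] = {}
    role_start = 0  # first device index available to the current role
    for role in roles:
        per_replica = role["gpu"]
        replicas = []
        for replica_id in range(role["num_replicas"]):
            start = role_start + per_replica * replica_id
            devices = range(start, start + per_replica)
            replicas.append(",".join(str(d) for d in devices))
        assignments[role["name"]] = replicas
        role_start += per_replica * role["num_replicas"]
    return assignments


# Two roles, each with 2 replicas, requesting 2 and 3 GPUs per replica:
# role1 -> ["0,1", "2,3"], role2 -> ["4,5,6", "7,8,9"] (the same values the new tests assert).
print(assign_visible_devices([
    {"name": "role1", "gpu": 2, "num_replicas": 2},
    {"name": "role2", "gpu": 3, "num_replicas": 2},
]))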

File tree

2 files changed: +181, -6 lines changed


torchx/schedulers/local_scheduler.py

Lines changed: 78 additions & 5 deletions
@@ -102,9 +102,9 @@ class ReplicaParam:
     env: Dict[str, str]

     # IO stream files
-    stdout: Optional[str]
-    stderr: Optional[str]
-    combined: Optional[str]
+    stdout: Optional[str] = None
+    stderr: Optional[str] = None
+    combined: Optional[str] = None

     cwd: Optional[str] = None

@@ -592,6 +592,15 @@ def run_opts(self) -> runopts:
             help="if set, prepends CWD to replica's PATH env var"
             " making any binaries in CWD take precedence over those in PATH",
         )
+        opts.add(
+            "auto_set_cuda_devices",
+            type_=bool,
+            default=False,
+            help="if set, sets `CUDA_VISIBLE_DEVICES` for roles that request GPU resources."
+            " Each role replica is assigned the number of GPUs it requested."
+            " If the device count is less than the total number of GPUs requested"
+            " across role replicas, `CUDA_VISIBLE_DEVICES` will not be set",
+        )
         return opts

     def _validate(self, app: AppDef, scheduler: SchedulerBackend) -> None:
@@ -768,6 +777,69 @@ def _submit_dryrun(
         request = self._to_popen_request(app, cfg)
         return AppDryRunInfo(request, lambda p: pprint.pformat(p, indent=2, width=80))

+    def _get_gpu_count(self) -> int:
+        gpu_cmd = "nvidia-smi -L"
+        try:
+            log.debug(f"Running {gpu_cmd}")
+            result = subprocess.run(
+                gpu_cmd.split(), capture_output=True, text=True, check=True
+            )
+            log.debug(f"Cmd {gpu_cmd} returned: {result}")
+            gpus_info = [gpu_info for gpu_info in result.stdout.split("\n") if gpu_info]
+            return len(gpus_info)
+        except subprocess.CalledProcessError as e:
+            log.exception(f"Got exception while getting GPUs {e.stderr}")
+            return 0
+        except Exception:
+            log.exception("Got exception while getting GPUs")
+            return 0
+
+    def _set_cuda_devices_for_role_replica(
+        self,
+        replica: ReplicaParam,
+        replica_id: int,
+        requested_gpus: int,
+        role_gpu_start_idx: int,
+    ) -> None:
+        if requested_gpus <= 0:
+            return
+        start_device = role_gpu_start_idx + requested_gpus * replica_id
+        end_device = role_gpu_start_idx + requested_gpus * (replica_id + 1)
+        devices = list(range(start_device, end_device))
+        visible_devices = ",".join([str(device) for device in devices])
+        replica.env["CUDA_VISIBLE_DEVICES"] = visible_devices
+
+    def _update_env_cuda_visible_devices(
+        self,
+        role_params: Dict[str, List[ReplicaParam]],
+        app: AppDef,
+        cfg: Mapping[str, CfgVal],
+    ) -> None:
+        device_count = 0
+        total_requested_gpus = self._get_total_requested_gpus(app.roles)
+        auto_set_cuda_devices = cfg.get("auto_set_cuda_devices", False)
+        if auto_set_cuda_devices and total_requested_gpus > 0:
+            device_count = self._get_gpu_count()
+        if auto_set_cuda_devices and total_requested_gpus > device_count:
+            auto_set_cuda_devices = False
+            log.warning(
+                "Cannot set `CUDA_VISIBLE_DEVICES`: "
+                f"available GPUs ({device_count}) fewer than requested ({total_requested_gpus})"
+            )
+        if not auto_set_cuda_devices:
+            return
+        role_gpu_start_idx = 0
+        for role in app.roles:
+            role_replicas = role_params[role.name]
+            for replica_id, replica in enumerate(role_replicas):
+                self._set_cuda_devices_for_role_replica(
+                    replica, replica_id, role.resource.gpu, role_gpu_start_idx
+                )
+            role_gpu_start_idx += role.resource.gpu * role.num_replicas
+
+    def _get_total_requested_gpus(self, roles: List[Role]) -> int:
+        return sum([role.resource.gpu * role.num_replicas for role in roles])
+
     def _to_popen_request(
         self,
         app: AppDef,
@@ -783,6 +855,7 @@ def _to_popen_request(

         role_params: Dict[str, List[ReplicaParam]] = {}
         role_log_dirs: Dict[str, List[str]] = {}
+
         for role in app.roles:
             replica_params = role_params.setdefault(role.name, [])
             replica_log_dirs = role_log_dirs.setdefault(role.name, [])
@@ -796,8 +869,8 @@ def _to_popen_request(
                     replica_id=str(replica_id),
                 )
                 replica_role = values.apply(role)
-                replica_log_dir = os.path.join(app_log_dir, role.name, str(replica_id))

+                replica_log_dir = os.path.join(app_log_dir, role.name, str(replica_id))
                 if "TORCHELASTIC_ERROR_FILE" not in replica_role.env:
                     # this is the top level (agent if using elastic role) error file
                     # a.k.a scheduler reply file
@@ -816,7 +889,7 @@ def _to_popen_request(
                     )
                 )
                 replica_log_dirs.append(replica_log_dir)
-
+        self._update_env_cuda_visible_devices(role_params, app, cfg)
         return PopenRequest(app_id, app_log_dir, role_params, role_log_dirs)

     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
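
The new option is read from the scheduler cfg inside `_to_popen_request` via `_update_env_cuda_visible_devices`. A rough usage sketch follows, assuming `scheduler` is a local scheduler instance and `app` is an `AppDef` whose roles request GPUs; the call pattern mirrors the unit tests below, and since `_to_popen_request` is a private method this is for illustration only, not a prescribed public workflow.

# Illustration only: mirrors how the new tests exercise the option.
# Assumes `scheduler` is a LocalScheduler instance and `app` is a GPU-requesting AppDef.
cfg = {"auto_set_cuda_devices": True}
popen_req = scheduler._to_popen_request(app, cfg)
for role_name, replicas in popen_req.role_params.items():
    for replica in replicas:
        # When enough GPUs are visible, each replica's env carries its own
        # device slice, e.g. "0,1" for replica 0 and "2,3" for replica 1.
        print(role_name, replica.env.get("CUDA_VISIBLE_DEVICES"))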

torchx/schedulers/test/local_scheduler_test.py

Lines changed: 103 additions & 1 deletion
@@ -15,6 +15,7 @@
 import time
 import unittest
 from contextlib import contextmanager
+from dataclasses import dataclass
 from datetime import datetime
 from os.path import join
 from typing import Callable, Generator, Optional
@@ -32,7 +33,7 @@
     join_PATH,
     make_unique,
 )
-from torchx.specs.api import AppDef, AppState, Role, is_terminal, macros
+from torchx.specs.api import AppDef, AppState, Role, is_terminal, macros, Resource

 from .test_util import write_shell_script

@@ -828,6 +829,104 @@ def test_close_twice(self) -> None:
         self.scheduler.close()
         # nothing to validate just make sure no errors are raised

+    def test_get_gpu_count(self) -> None:
+        @dataclass
+        class ProcResult:
+            stdout: str
+
+        nvidia_smi_out = (
+            "GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-196a22c5-717b-66db-0acc-58cde6f3df85)\n"
+            "GPU 1: Tesla V100-SXM2-16GB (UUID: GPU-45e9165d-4f7e-d954-7ff5-481bc2c0ec7b)\n"
+            "GPU 2: Tesla V100-SXM2-16GB (UUID: GPU-26e22503-5fd5-8f55-d068-e1714fbb6fd6)\n"
+            "GPU 3: Tesla V100-SXM2-16GB (UUID: GPU-ebfc20c7-5f1a-1bc9-0d98-601cbe21fc2d)\n"
+        )
+
+        stdout = nvidia_smi_out
+        result = ProcResult(stdout)
+        with patch("subprocess.run", return_value=result):
+            gpu_count = self.scheduler._get_gpu_count()
+            self.assertEqual(4, gpu_count)
+
+    def test_get_gpu_count_error(self) -> None:
+        with patch("subprocess.run", side_effect=Exception("test error")):
+            gpu_count = self.scheduler._get_gpu_count()
+            self.assertEqual(0, gpu_count)
+
+    def test_set_cuda_devices_for_role_replica(self) -> None:
+        replica_param1 = ReplicaParam(
+            args=["a", "b"],
+            env={},
+            cwd="/home/bob",
+        )
+        replica_param2 = ReplicaParam(
+            args=["a", "b"],
+            env={},
+            cwd="/home/bob",
+        )
+        self.scheduler._set_cuda_devices_for_role_replica(replica_param1, 0, 4, 0)
+        self.assertEqual("0,1,2,3", replica_param1.env["CUDA_VISIBLE_DEVICES"])
+        self.scheduler._set_cuda_devices_for_role_replica(replica_param2, 1, 8, 4)
+        # start device is 4 (role_gpu_start_idx) + 8 (requested_gpus) * 1 (replica_id) = 12
+        self.assertEqual(
+            "12,13,14,15,16,17,18,19", replica_param2.env["CUDA_VISIBLE_DEVICES"]
+        )
+
+    def test_get_cuda_devices_is_set(self) -> None:
+        with patch.object(self.scheduler, "_get_gpu_count", return_value=16):
+            appdef = AppDef(
+                name="role1",
+                roles=[
+                    Role(
+                        name="role1",
+                        image=self.test_dir,
+                        entrypoint="train",
+                        resource=Resource(gpu=2, cpu=0, memMB=0),
+                        num_replicas=2,
+                    ),
+                    Role(
+                        name="role2",
+                        image=self.test_dir,
+                        entrypoint="train",
+                        resource=Resource(gpu=3, cpu=0, memMB=0),
+                        num_replicas=2,
+                    ),
+                ],
+            )
+            popen_req = self.scheduler._to_popen_request(
+                appdef, {"auto_set_cuda_devices": True}
+            )
+            role1_params = popen_req.role_params["role1"]
+            self.assertEqual(2, len(role1_params))
+            self.assertEqual("0,1", role1_params[0].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("2,3", role1_params[1].env["CUDA_VISIBLE_DEVICES"])
+            role2_params = popen_req.role_params["role2"]
+            self.assertEqual(2, len(role2_params))
+            self.assertEqual("4,5,6", role2_params[0].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("7,8,9", role2_params[1].env["CUDA_VISIBLE_DEVICES"])
+
+    def test_get_cuda_devices_not_set(self) -> None:
+        with patch.object(self.scheduler, "_get_gpu_count", return_value=8):
+            trainer1 = AppDef(
+                name="trainer1",
+                roles=[
+                    Role(
+                        name="trainer1",
+                        image=self.test_dir,
+                        entrypoint="trainer1.sh",
+                        resource=Resource(gpu=4, cpu=0, memMB=0),
+                        num_replicas=4,
+                    )
+                ],
+            )
+
+            popen_req = self.scheduler._to_popen_request(trainer1, {})
+            role_params = popen_req.role_params["trainer1"]
+            self.assertEqual(4, len(role_params))
+            self.assertFalse("CUDA_VISIBLE_DEVICES" in role_params[0].env)
+            self.assertFalse("CUDA_VISIBLE_DEVICES" in role_params[1].env)
+            self.assertFalse("CUDA_VISIBLE_DEVICES" in role_params[2].env)
+            self.assertFalse("CUDA_VISIBLE_DEVICES" in role_params[3].env)
+
     def test_no_orphan_process_function(self) -> None:
         self._test_orphan_workflow()

@@ -839,6 +938,9 @@ def _test_orphan_workflow(self) -> None:
             target=start_sleep_processes, args=(self.test_dir, mp_queue, child_nproc)
        )
         proc.start()
+        # Before querying the queue we need to wait
+        # Otherwise we will get `FileNotFoundError: [Errno 2] No such file or directory` error
+        time.sleep(10)
         total_processes = child_nproc + 1
         pids = []
         for _ in range(total_processes):
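
`test_get_gpu_count` above mocks `subprocess.run` and asserts that `_get_gpu_count` simply counts the non-empty lines of `nvidia-smi -L` output. The same parsing in isolation, using the sample output from the test, is sketched below.

# Standalone sketch of the `nvidia-smi -L` parsing that _get_gpu_count relies on;
# the sample output is the one mocked in test_get_gpu_count above.
sample = (
    "GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-196a22c5-717b-66db-0acc-58cde6f3df85)\n"
    "GPU 1: Tesla V100-SXM2-16GB (UUID: GPU-45e9165d-4f7e-d954-7ff5-481bc2c0ec7b)\n"
    "GPU 2: Tesla V100-SXM2-16GB (UUID: GPU-26e22503-5fd5-8f55-d068-e1714fbb6fd6)\n"
    "GPU 3: Tesla V100-SXM2-16GB (UUID: GPU-ebfc20c7-5f1a-1bc9-0d98-601cbe21fc2d)\n"
)
gpu_count = len([line for line in sample.split("\n") if line])
assert gpu_count == 4  # matches the expectation in test_get_gpu_count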
