
Commit f4fb0bf

aivanou authored and facebook-github-bot committed
Add automatic set of CUDA_VISIBLE_DEVICES for local scheduler (#383)
Summary:
Pull Request resolved: #383

The diff adds automatic setting of `CUDA_VISIBLE_DEVICES` based on `num_replicas`; each replica gets the same number of devices. The algorithm applies only when `CUDA_VISIBLE_DEVICES` is not already set. The diff uses `nvidia-smi` to determine the number of GPUs.

Refs: #297 #377

Differential Revision: D34064433

fbshipit-source-id: e9641e93fb487b38000f77c88b550a3149443f75
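For orientation, the assignment scheme the summary describes is contiguous: each replica of a role receives the next block of device indices. A minimal sketch of that scheme (the helper `assign_devices` is illustrative, not part of the diff):

    # Hypothetical sketch of the contiguous scheme: replica i of a role that
    # requests g GPUs sees device indices [i*g, (i+1)*g).
    from typing import Dict

    def assign_devices(num_replicas: int, gpus_per_replica: int) -> Dict[int, str]:
        assignments: Dict[int, str] = {}
        for replica_id in range(num_replicas):
            start = replica_id * gpus_per_replica
            devices = range(start, start + gpus_per_replica)
            assignments[replica_id] = ",".join(str(d) for d in devices)
        return assignments

    # assign_devices(4, 2) -> {0: "0,1", 1: "2,3", 2: "4,5", 3: "6,7"}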
1 parent c7efc75 commit f4fb0bf

File tree

2 files changed (+162, -2 lines)


torchx/schedulers/local_scheduler.py

Lines changed: 61 additions & 1 deletion

@@ -589,6 +589,15 @@ def run_opts(self) -> runopts:
             help="if set, prepends CWD to replica's PATH env var"
             " making any binaries in CWD take precedence over those in PATH",
         )
+        opts.add(
+            "auto_set_cuda_devices",
+            type_=bool,
+            default=False,
+            help="if set, sets `CUDA_VISIBLE_DEVICES` for roles that request GPU resources."
+            " The scheduler will try to assign available GPUs to all role replicas"
+            " that request GPUs. If the device count is less than the total number of"
+            " GPUs requested across role replicas, `CUDA_VISIBLE_DEVICES` will not be set",
+        )
         return opts
 
     def _validate(self, app: AppDef, scheduler: SchedulerBackend) -> None:
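With the option registered, enabling it is a matter of passing `auto_set_cuda_devices` through the scheduler cfg. A minimal usage sketch; the cfg-dict form mirrors what the tests below pass to `_to_popen_request`, while the `submit_dryrun`/`schedule` calls and the CLI spelling are assumptions about the surrounding torchx API:

    # Python API sketch: the option travels in the scheduler cfg.
    cfg = {"auto_set_cuda_devices": True}
    # dryrun_info = scheduler.submit_dryrun(app, cfg)  # assumed entry point
    # app_id = scheduler.schedule(dryrun_info)

    # CLI sketch (assumed flag spelling):
    #   torchx run -s local_cwd -cfg auto_set_cuda_devices=True <component> ...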
@@ -765,6 +774,40 @@ def _submit_dryrun(
         request = self._to_popen_request(app, cfg)
         return AppDryRunInfo(request, lambda p: pprint.pformat(p, indent=2, width=80))
 
+    def _get_gpu_count(self) -> int:
+        gpu_cmd = "nvidia-smi -L"
+        try:
+            log.debug(f"Running {gpu_cmd}")
+            result = subprocess.run(
+                gpu_cmd.split(), capture_output=True, text=True, check=True
+            )
+            log.debug(f"Cmd {gpu_cmd} returned: {result}")
+            gpus_info = [gpu_info for gpu_info in result.stdout.split("\n") if gpu_info]
+            return len(gpus_info)
+        except subprocess.CalledProcessError as e:
+            log.exception(f"Got exception while getting GPUs {e.stderr}")
+            return 0
+        except Exception:
+            log.exception("Got exception while getting GPUs")
+            return 0
+
+    def _set_cuda_devices(
+        self,
+        role: Role,
+        replica_id: int,
+        role_gpu_start_idx: int,
+    ) -> None:
+        if role.resource.gpu <= 0:
+            return
+        start_device = role_gpu_start_idx + role.resource.gpu * replica_id
+        end_device = role_gpu_start_idx + role.resource.gpu * (replica_id + 1)
+        devices = list(range(start_device, end_device))
+        visible_devices = ",".join([str(device) for device in devices])
+        role.env["CUDA_VISIBLE_DEVICES"] = visible_devices
+
+    def _get_total_requested_gpus(self, roles: List[Role]) -> int:
+        return sum([role.resource.gpu * role.num_replicas for role in roles])
+
     def _to_popen_request(
         self,
         app: AppDef,
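To make the index arithmetic in `_set_cuda_devices` concrete: each replica receives a contiguous block of `role.resource.gpu` device indices, offset by the role's starting index. A standalone sketch of the same computation (a plain function, not the scheduler method):

    def cuda_devices(gpus_per_replica: int, replica_id: int, role_gpu_start_idx: int) -> str:
        # Same arithmetic as _set_cuda_devices: a contiguous block of
        # gpus_per_replica indices, shifted by the role's start index.
        start = role_gpu_start_idx + gpus_per_replica * replica_id
        end = role_gpu_start_idx + gpus_per_replica * (replica_id + 1)
        return ",".join(str(d) for d in range(start, end))

    assert cuda_devices(2, 0, 4) == "4,5"
    assert cuda_devices(2, 1, 4) == "6,7"
    assert cuda_devices(4, 1, 10) == "14,15,16,17"  # mirrors test_set_cuda_devices below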
@@ -780,6 +823,20 @@ def _to_popen_request(
 
         role_params: Dict[str, List[ReplicaParam]] = {}
         role_log_dirs: Dict[str, List[str]] = {}
+
+        device_count = 0
+        total_requested_gpus = self._get_total_requested_gpus(app.roles)
+        auto_set_cuda_devices = cfg.get("auto_set_cuda_devices", False)
+        if auto_set_cuda_devices and total_requested_gpus > 0:
+            device_count = self._get_gpu_count()
+        if auto_set_cuda_devices and total_requested_gpus > device_count:
+            auto_set_cuda_devices = False
+            log.warning(
+                "Cannot set `CUDA_VISIBLE_DEVICES`: available GPU count "
+                f"{device_count} is less than the requested total {total_requested_gpus}"
+            )
+
+        role_gpu_start_idx = 0
         for role in app.roles:
             replica_params = role_params.setdefault(role.name, [])
             replica_log_dirs = role_log_dirs.setdefault(role.name, [])
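The gate above disables auto-assignment rather than oversubscribing devices. Using the role sizes from the tests below: a role with 2 GPUs x 2 replicas plus a role with 3 GPUs x 2 replicas requests 10 devices; with 16 visible GPUs assignment proceeds, while on an 8-GPU host `auto_set_cuda_devices` flips back to False and only the warning is logged. A sketch of the same check:

    roles = [(2, 2), (3, 2)]  # (gpus per replica, num replicas), as in the tests below
    total_requested = sum(gpu * replicas for gpu, replicas in roles)  # 10
    device_count = 8  # e.g. what `nvidia-smi -L` reported
    auto_set = total_requested <= device_count  # False: skip setting the env var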
@@ -793,8 +850,10 @@ def _to_popen_request(
                     replica_id=str(replica_id),
                 )
                 replica_role = values.apply(role)
-                replica_log_dir = os.path.join(app_log_dir, role.name, str(replica_id))
+                if auto_set_cuda_devices:
+                    self._set_cuda_devices(replica_role, replica_id, role_gpu_start_idx)
 
+                replica_log_dir = os.path.join(app_log_dir, role.name, str(replica_id))
                 if "TORCHELASTIC_ERROR_FILE" not in replica_role.env:
                     # this is the top level (agent if using elastic role) error file
                     # a.k.a scheduler reply file
@@ -813,6 +872,7 @@ def _to_popen_request(
                     )
                 )
                 replica_log_dirs.append(replica_log_dir)
+            role_gpu_start_idx += role.resource.gpu * role.num_replicas
 
         return PopenRequest(app_id, app_log_dir, role_params, role_log_dirs)

torchx/schedulers/test/local_scheduler_test.py

Lines changed: 101 additions & 1 deletion

@@ -15,6 +15,7 @@
 import time
 import unittest
 from contextlib import contextmanager
+from dataclasses import dataclass
 from datetime import datetime
 from os.path import join
 from typing import Callable, Generator, Optional
@@ -32,7 +33,7 @@
     join_PATH,
     make_unique,
 )
-from torchx.specs.api import AppDef, AppState, Role, is_terminal, macros
+from torchx.specs.api import AppDef, AppState, Role, is_terminal, macros, Resource
 
 from .test_util import write_shell_script
 
@@ -828,6 +829,102 @@ def test_close_twice(self) -> None:
         self.scheduler.close()
         # nothing to validate just make sure no errors are raised
 
+    def test_get_gpu_count(self) -> None:
+        @dataclass
+        class ProcResult:
+            stdout: str
+
+        nvidia_smi_out = (
+            "GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-196a22c5-717b-66db-0acc-58cde6f3df85)\n"
+            "GPU 1: Tesla V100-SXM2-16GB (UUID: GPU-45e9165d-4f7e-d954-7ff5-481bc2c0ec7b)\n"
+            "GPU 2: Tesla V100-SXM2-16GB (UUID: GPU-26e22503-5fd5-8f55-d068-e1714fbb6fd6)\n"
+            "GPU 3: Tesla V100-SXM2-16GB (UUID: GPU-ebfc20c7-5f1a-1bc9-0d98-601cbe21fc2d)\n"
+        )
+
+        stdout = nvidia_smi_out
+        result = ProcResult(stdout)
+        with patch("subprocess.run", return_value=result):
+            gpu_count = self.scheduler._get_gpu_count()
+            self.assertEqual(4, gpu_count)
+
+    def test_get_gpu_count_error(self) -> None:
+        with patch("subprocess.run", side_effect=Exception("test error")):
+            gpu_count = self.scheduler._get_gpu_count()
+            self.assertEqual(0, gpu_count)
+
+    def test_set_cuda_devices(self) -> None:
+        role = Role(
+            name="sleep",
+            image=self.test_dir,
+            entrypoint="sleep.sh",
+            args=["60"],
+            num_replicas=4,
+            resource=Resource(gpu=4, cpu=0, memMB=0),
+        )
+        self.scheduler._set_cuda_devices(role, 0, 0)
+        self.assertEqual("0,1,2,3", role.env["CUDA_VISIBLE_DEVICES"])
+        self.scheduler._set_cuda_devices(role, 1, 0)
+        self.assertEqual("4,5,6,7", role.env["CUDA_VISIBLE_DEVICES"])
+        self.scheduler._set_cuda_devices(role, 1, 10)
+        self.assertEqual("14,15,16,17", role.env["CUDA_VISIBLE_DEVICES"])
+
+    def test_get_cuda_devices_is_set(self) -> None:
+        with patch.object(self.scheduler, "_get_gpu_count", return_value=16):
+            appdef = AppDef(
+                name="role1",
+                roles=[
+                    Role(
+                        name="role1",
+                        image=self.test_dir,
+                        entrypoint="train",
+                        resource=Resource(gpu=2, cpu=0, memMB=0),
+                        num_replicas=2,
+                    ),
+                    Role(
+                        name="role2",
+                        image=self.test_dir,
+                        entrypoint="train",
+                        resource=Resource(gpu=3, cpu=0, memMB=0),
+                        num_replicas=2,
+                    ),
+                ],
+            )
+
+            popen_req = self.scheduler._to_popen_request(
+                appdef, {"auto_set_cuda_devices": True}
+            )
+            role1_params = popen_req.role_params["role1"]
+            self.assertEqual(2, len(role1_params))
+            self.assertEqual("0,1", role1_params[0].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("2,3", role1_params[1].env["CUDA_VISIBLE_DEVICES"])
+            role2_params = popen_req.role_params["role2"]
+            self.assertEqual(2, len(role2_params))
+            self.assertEqual("4,5,6", role2_params[0].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("7,8,9", role2_params[1].env["CUDA_VISIBLE_DEVICES"])
+
+    def test_get_cuda_devices_not_set(self) -> None:
+        with patch.object(self.scheduler, "_get_gpu_count", return_value=8):
+            trainer1 = AppDef(
+                name="trainer1",
+                roles=[
+                    Role(
+                        name="trainer1",
+                        image=self.test_dir,
+                        entrypoint="trainer1.sh",
+                        resource=Resource(gpu=4, cpu=0, memMB=0),
+                        num_replicas=4,
+                    )
+                ],
+            )
+
+            popen_req = self.scheduler._to_popen_request(trainer1, {})
+            role_params = popen_req.role_params["trainer1"]
+            self.assertEqual(4, len(role_params))
+            self.assertFalse("CUDA_VISIBLE_DEVICES" in role_params[0].env)
+            self.assertFalse("CUDA_VISIBLE_DEVICES" in role_params[1].env)
+            self.assertFalse("CUDA_VISIBLE_DEVICES" in role_params[2].env)
+            self.assertFalse("CUDA_VISIBLE_DEVICES" in role_params[3].env)
+
     def test_no_orphan_process_function(self) -> None:
         self._test_orphan_workflow()
 
@@ -839,6 +936,9 @@ def _test_orphan_workflow(self) -> None:
             target=start_sleep_processes, args=(self.test_dir, mp_queue, child_nproc)
         )
        proc.start()
+        # Before querying the queue we need to wait;
+        # otherwise we get `FileNotFoundError: [Errno 2] No such file or directory`
+        time.sleep(10)
         total_processes = child_nproc + 1
         pids = []
         for _ in range(total_processes):