
Commit 328bcc6

aivanou authored and facebook-github-bot committed
Add automatic set of CUDA_VISIBLE_DEVICES for local scheduler (pytorch#383)
Summary: Pull Request resolved: pytorch#383

This diff automatically sets `CUDA_VISIBLE_DEVICES` for the local scheduler based on `num_replicas`: the detected GPUs are split evenly, so each replica gets the same number of devices. The logic applies only when `CUDA_VISIBLE_DEVICES` is not already set. The diff uses `nvidia-smi` to determine the number of available GPUs.

pytorch#297 pytorch#377

Differential Revision: D34064433

fbshipit-source-id: 788ce92b0ad79e24f4be22bb2d5e9f784f25004b
1 parent 4b989d5 commit 328bcc6
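
To illustrate the device assignment described in the summary, here is a minimal sketch of the even partitioning. The standalone helper `bucket_devices` is hypothetical; in the diff below the logic lives in `LocalScheduler._get_cuda_devices`.

# Hypothetical sketch of the even GPU partitioning; the real logic lives in
# LocalScheduler._get_cuda_devices in the diff below.
def bucket_devices(replica_id: int, num_replicas: int, gpu_count: int) -> str:
    bucket = gpu_count // num_replicas
    return ",".join(str(d) for d in range(bucket * replica_id, bucket * (replica_id + 1)))

# With 8 detected GPUs and 2 replicas:
#   bucket_devices(0, 2, 8) -> "0,1,2,3"
#   bucket_devices(1, 2, 8) -> "4,5,6,7"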

File tree

2 files changed: +104 lines, -0 lines


torchx/schedulers/local_scheduler.py

Lines changed: 44 additions & 0 deletions
@@ -765,6 +765,31 @@ def _submit_dryrun(
         request = self._to_popen_request(app, cfg)
         return AppDryRunInfo(request, lambda p: pprint.pformat(p, indent=2, width=80))

+    def _get_gpu_count(self) -> int:
+        gpu_cmd = "nvidia-smi -L"
+        try:
+            result = subprocess.run(gpu_cmd.split(), capture_output=True)
+            log.debug(f"Cmd {gpu_cmd} returned: {result}")
+            gpus_info = result.stdout.decode().split("\n")[0:-1]
+            return len(gpus_info)
+        except Exception:
+            log.debug(f"No GPUs detected via {gpu_cmd}")
+            return 0
+
+    def _get_cuda_devices(self, replica_id: int, num_replicas: int) -> Optional[str]:
+        gpu_device_count = self._get_gpu_count()
+        gpu_bucket_size = int(gpu_device_count / num_replicas)
+        if gpu_device_count != 0:
+            devices = list(
+                range(
+                    gpu_bucket_size * replica_id,
+                    gpu_bucket_size * (replica_id + 1),
+                )
+            )
+            visible_devices = ",".join([str(d) for d in devices])
+            return visible_devices
+        return None
+
     def _to_popen_request(
         self,
         app: AppDef,
@@ -786,6 +811,19 @@ def _to_popen_request(

             img_root = image_provider.fetch_role(role)

+            gpu_device_count = self._get_gpu_count()
+            if gpu_device_count != 0 and gpu_device_count < role.num_replicas:
+                log.warning(
+                    "Different role replicas will occupy the same device. "
+                    "Decrease the number of replicas by changing the `role.num_replicas` parameter. "
+                    f"Devices detected: {gpu_device_count}, num replicas: {role.num_replicas}"
+                )
+            if gpu_device_count != 0 and gpu_device_count % role.num_replicas != 0:
+                log.warning(
+                    "Number of detected GPUs is not divisible by the number of replicas. "
+                    f"GPUs detected: {gpu_device_count}, num replicas: {role.num_replicas}"
+                )
+
             for replica_id in range(role.num_replicas):
                 values = macros.Values(
                     img_root=img_root,
@@ -794,6 +832,12 @@ def _to_popen_request(
                 )
                 replica_role = values.apply(role)
                 replica_log_dir = os.path.join(app_log_dir, role.name, str(replica_id))
+                visible_devices = self._get_cuda_devices(replica_id, role.num_replicas)
+                if visible_devices and "CUDA_VISIBLE_DEVICES" not in replica_role.env:
+                    log.debug(
+                        f"Setting role replica {replica_id} with {visible_devices} devices"
+                    )
+                    replica_role.env["CUDA_VISIBLE_DEVICES"] = visible_devices

                 if "TORCHELASTIC_ERROR_FILE" not in replica_role.env:
                     # this is the top level (agent if using elastic role) error file
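
Note that the assignment above is guarded by `"CUDA_VISIBLE_DEVICES" not in replica_role.env`, so a value configured on the role takes precedence over the automatic bucketing. A minimal sketch, assuming a role defined via `torchx.specs.Role` (the name, image, and entrypoint below are illustrative):

# Hypothetical role: the explicit env value is kept as-is, so this replica
# only sees GPU 0 regardless of how many devices nvidia-smi reports.
role = Role(
    name="trainer",
    image="/tmp/trainer",
    entrypoint="train.sh",
    num_replicas=1,
    env={"CUDA_VISIBLE_DEVICES": "0"},
)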

torchx/schedulers/test/local_scheduler_test.py

Lines changed: 60 additions & 0 deletions
@@ -15,6 +15,7 @@
 import time
 import unittest
 from contextlib import contextmanager
+from dataclasses import dataclass
 from datetime import datetime
 from os.path import join
 from typing import Callable, Generator, Optional
@@ -828,6 +829,62 @@ def test_close_twice(self) -> None:
         self.scheduler.close()
         # nothing to validate just make sure no errors are raised

+    def test_get_gpu_count(self) -> None:
+        @dataclass
+        class ProcResult:
+            stdout: bytes
+
+        nvidia_smi_out = (
+            "GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-196a22c5-717b-66db-0acc-58cde6f3df85)\n"
+            "GPU 1: Tesla V100-SXM2-16GB (UUID: GPU-45e9165d-4f7e-d954-7ff5-481bc2c0ec7b)\n"
+            "GPU 2: Tesla V100-SXM2-16GB (UUID: GPU-26e22503-5fd5-8f55-d068-e1714fbb6fd6)\n"
+            "GPU 3: Tesla V100-SXM2-16GB (UUID: GPU-ebfc20c7-5f1a-1bc9-0d98-601cbe21fc2d)\n"
+        )
+
+        stdout = nvidia_smi_out.encode()
+        result = ProcResult(stdout)
+        with patch("subprocess.run", return_value=result):
+            gpu_count = self.scheduler._get_gpu_count()
+            self.assertEqual(4, gpu_count)
+
+    def test_get_gpu_count_error(self) -> None:
+        with patch("subprocess.run", side_effect=Exception("test error")):
+            gpu_count = self.scheduler._get_gpu_count()
+            self.assertEqual(0, gpu_count)
+
+    def test_get_cuda_devices(self) -> None:
+        with patch.object(self.scheduler, "_get_gpu_count", return_value=8):
+            self.assertEqual("0,1,2,3", self.scheduler._get_cuda_devices(0, 2))
+            self.assertEqual("4,5,6,7", self.scheduler._get_cuda_devices(1, 2))
+        with patch.object(self.scheduler, "_get_gpu_count", return_value=4):
+            self.assertEqual("0", self.scheduler._get_cuda_devices(0, 4))
+            self.assertEqual("1", self.scheduler._get_cuda_devices(1, 4))
+            self.assertEqual("2", self.scheduler._get_cuda_devices(2, 4))
+            self.assertEqual("3", self.scheduler._get_cuda_devices(3, 4))
+
+    def test_get_cuda_devices_is_set(self) -> None:
+        with patch.object(self.scheduler, "_get_gpu_count", return_value=8):
+            sleep_60sec = AppDef(
+                name="sleep",
+                roles=[
+                    Role(
+                        name="sleep",
+                        image=self.test_dir,
+                        entrypoint="sleep.sh",
+                        args=["60"],
+                        num_replicas=4,
+                    )
+                ],
+            )
+
+            popen_req = self.scheduler._to_popen_request(sleep_60sec, {})
+            role_params = popen_req.role_params["sleep"]
+            self.assertEqual(4, len(role_params))
+            self.assertEqual("0,1", role_params[0].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("2,3", role_params[1].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("4,5", role_params[2].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("6,7", role_params[3].env["CUDA_VISIBLE_DEVICES"])
+
     def test_no_orphan_process_function(self) -> None:
         self._test_orphan_workflow()

@@ -839,6 +896,9 @@ def _test_orphan_workflow(self) -> None:
             target=start_sleep_processes, args=(self.test_dir, mp_queue, child_nproc)
         )
         proc.start()
+        # Wait before querying the queue; otherwise we may get a
+        # `FileNotFoundError: [Errno 2] No such file or directory` error.
+        time.sleep(10)
         total_processes = child_nproc + 1
         pids = []
         for _ in range(total_processes):
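
The new tests can be run with a standard test runner, for example with pytest (assuming pytest is installed; the `-k` filter below is only an illustration):

python -m pytest torchx/schedulers/test/local_scheduler_test.py -k "gpu or cuda"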
