
Commit f4fb0bf

aivanou authored and facebook-github-bot committed
Add automatic set of CUDA_VISIBLE_DEVICES for local scheduler (#383)
Summary:
Pull Request resolved: #383

The diff adds automatic setting of `CUDA_VISIBLE_DEVICES` based on `num_replicas`; each replica gets the same number of devices. The algorithm applies only when `CUDA_VISIBLE_DEVICES` is not already set. The diff uses `nvidia-smi` to determine the number of GPUs.

Refs: #297 #377

Differential Revision: D34064433

fbshipit-source-id: e9641e93fb487b38000f77c88b550a3149443f75
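For orientation, the assignment scheme the summary describes is contiguous: each replica of a role receives the next block of device indices. A minimal sketch of that scheme (the helper `assign_devices` is illustrative, not part of the diff):

    # Hypothetical sketch of the contiguous scheme: replica i of a role that
    # requests g GPUs sees device indices [i*g, (i+1)*g).
    from typing import Dict

    def assign_devices(num_replicas: int, gpus_per_replica: int) -> Dict[int, str]:
        assignments: Dict[int, str] = {}
        for replica_id in range(num_replicas):
            start = replica_id * gpus_per_replica
            devices = range(start, start + gpus_per_replica)
            assignments[replica_id] = ",".join(str(d) for d in devices)
        return assignments

    # assign_devices(4, 2) -> {0: "0,1", 1: "2,3", 2: "4,5", 3: "6,7"}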
1 parent c7efc75 commit f4fb0bf

File tree

2 files changed (+162, -2 lines)


torchx/schedulers/local_scheduler.py

Lines changed: 61 additions & 1 deletion

@@ -589,6 +589,15 @@ def run_opts(self) -> runopts:
             help="if set, prepends CWD to replica's PATH env var"
             " making any binaries in CWD take precedence over those in PATH",
         )
+        opts.add(
+            "auto_set_cuda_devices",
+            type_=bool,
+            default=False,
+            help="if set, sets `CUDA_VISIBLE_DEVICES` for roles that request GPU resources."
+            " The scheduler will try to assign available GPUs to all role replicas"
+            " that request GPUs. If the device count is less than the total number of"
+            " GPUs requested across role replicas, `CUDA_VISIBLE_DEVICES` will not be set",
+        )
         return opts
 
     def _validate(self, app: AppDef, scheduler: SchedulerBackend) -> None:
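With the option registered, enabling it is a matter of passing `auto_set_cuda_devices` through the scheduler cfg. A minimal usage sketch; the cfg-dict form mirrors what the tests below pass to `_to_popen_request`, while the `submit_dryrun`/`schedule` calls and the CLI spelling are assumptions about the surrounding torchx API:

    # Python API sketch: the option travels in the scheduler cfg.
    cfg = {"auto_set_cuda_devices": True}
    # dryrun_info = scheduler.submit_dryrun(app, cfg)  # assumed entry point
    # app_id = scheduler.schedule(dryrun_info)

    # CLI sketch (assumed flag spelling):
    #   torchx run -s local_cwd -cfg auto_set_cuda_devices=True <component> ...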
@@ -765,6 +774,40 @@ def _submit_dryrun(
         request = self._to_popen_request(app, cfg)
         return AppDryRunInfo(request, lambda p: pprint.pformat(p, indent=2, width=80))
 
+    def _get_gpu_count(self) -> int:
+        gpu_cmd = "nvidia-smi -L"
+        try:
+            log.debug(f"Running {gpu_cmd}")
+            result = subprocess.run(
+                gpu_cmd.split(), capture_output=True, text=True, check=True
+            )
+            log.debug(f"Cmd {gpu_cmd} returned: {result}")
+            gpus_info = [gpu_info for gpu_info in result.stdout.split("\n") if gpu_info]
+            return len(gpus_info)
+        except subprocess.CalledProcessError as e:
+            log.exception(f"Got exception while getting GPUs {e.stderr}")
+            return 0
+        except Exception:
+            log.exception("Got exception while getting GPUs")
+            return 0
+
+    def _set_cuda_devices(
+        self,
+        role: Role,
+        replica_id: int,
+        role_gpu_start_idx: int,
+    ) -> None:
+        if role.resource.gpu <= 0:
+            return
+        start_device = role_gpu_start_idx + role.resource.gpu * replica_id
+        end_device = role_gpu_start_idx + role.resource.gpu * (replica_id + 1)
+        devices = list(range(start_device, end_device))
+        visible_devices = ",".join([str(device) for device in devices])
+        role.env["CUDA_VISIBLE_DEVICES"] = visible_devices
+
+    def _get_total_requested_gpus(self, roles: List[Role]) -> int:
+        return sum([role.resource.gpu * role.num_replicas for role in roles])
+
     def _to_popen_request(
         self,
         app: AppDef,
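To make the index arithmetic in `_set_cuda_devices` concrete: each replica receives a contiguous block of `role.resource.gpu` device indices, offset by the role's starting index. A standalone sketch of the same computation (a plain function, not the scheduler method):

    def cuda_devices(gpus_per_replica: int, replica_id: int, role_gpu_start_idx: int) -> str:
        # Same arithmetic as _set_cuda_devices: a contiguous block of
        # gpus_per_replica indices, shifted by the role's start index.
        start = role_gpu_start_idx + gpus_per_replica * replica_id
        end = role_gpu_start_idx + gpus_per_replica * (replica_id + 1)
        return ",".join(str(d) for d in range(start, end))

    assert cuda_devices(2, 0, 4) == "4,5"
    assert cuda_devices(2, 1, 4) == "6,7"
    assert cuda_devices(4, 1, 10) == "14,15,16,17"  # mirrors test_set_cuda_devices below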
@@ -780,6 +823,20 @@ def _to_popen_request(
 
         role_params: Dict[str, List[ReplicaParam]] = {}
         role_log_dirs: Dict[str, List[str]] = {}
+
+        device_count = 0
+        total_requested_gpus = self._get_total_requested_gpus(app.roles)
+        auto_set_cuda_devices = cfg.get("auto_set_cuda_devices", False)
+        if auto_set_cuda_devices and total_requested_gpus > 0:
+            device_count = self._get_gpu_count()
+        if auto_set_cuda_devices and total_requested_gpus > device_count:
+            auto_set_cuda_devices = False
+            log.warning(
+                "Cannot set `CUDA_VISIBLE_DEVICES`: available GPU count "
+                f"{device_count} is less than the requested total {total_requested_gpus}"
+            )
+
+        role_gpu_start_idx = 0
         for role in app.roles:
             replica_params = role_params.setdefault(role.name, [])
             replica_log_dirs = role_log_dirs.setdefault(role.name, [])
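The gate above disables auto-assignment rather than oversubscribing devices. Using the role sizes from the tests below: a role with 2 GPUs x 2 replicas plus a role with 3 GPUs x 2 replicas requests 10 devices; with 16 visible GPUs assignment proceeds, while on an 8-GPU host `auto_set_cuda_devices` flips back to False and only the warning is logged. A sketch of the same check:

    roles = [(2, 2), (3, 2)]  # (gpus per replica, num replicas), as in the tests below
    total_requested = sum(gpu * replicas for gpu, replicas in roles)  # 10
    device_count = 8  # e.g. what `nvidia-smi -L` reported
    auto_set = total_requested <= device_count  # False: skip setting the env var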
@@ -793,8 +850,10 @@ def _to_popen_request(
                     replica_id=str(replica_id),
                 )
                 replica_role = values.apply(role)
-                replica_log_dir = os.path.join(app_log_dir, role.name, str(replica_id))
+                if auto_set_cuda_devices:
+                    self._set_cuda_devices(replica_role, replica_id, role_gpu_start_idx)
 
+                replica_log_dir = os.path.join(app_log_dir, role.name, str(replica_id))
                 if "TORCHELASTIC_ERROR_FILE" not in replica_role.env:
                     # this is the top level (agent if using elastic role) error file
                     # a.k.a scheduler reply file
@@ -813,6 +872,7 @@ def _to_popen_request(
                     )
                 )
                 replica_log_dirs.append(replica_log_dir)
+            role_gpu_start_idx += role.resource.gpu * role.num_replicas
 
         return PopenRequest(app_id, app_log_dir, role_params, role_log_dirs)

torchx/schedulers/test/local_scheduler_test.py

Lines changed: 101 additions & 1 deletion

@@ -15,6 +15,7 @@
 import time
 import unittest
 from contextlib import contextmanager
+from dataclasses import dataclass
 from datetime import datetime
 from os.path import join
 from typing import Callable, Generator, Optional
@@ -32,7 +33,7 @@
     join_PATH,
     make_unique,
 )
-from torchx.specs.api import AppDef, AppState, Role, is_terminal, macros
+from torchx.specs.api import AppDef, AppState, Role, is_terminal, macros, Resource
 
 from .test_util import write_shell_script
 
@@ -828,6 +829,102 @@ def test_close_twice(self) -> None:
         self.scheduler.close()
         # nothing to validate just make sure no errors are raised
 
+    def test_get_gpu_count(self) -> None:
+        @dataclass
+        class ProcResult:
+            stdout: str
+
+        nvidia_smi_out = (
+            "GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-196a22c5-717b-66db-0acc-58cde6f3df85)\n"
+            "GPU 1: Tesla V100-SXM2-16GB (UUID: GPU-45e9165d-4f7e-d954-7ff5-481bc2c0ec7b)\n"
+            "GPU 2: Tesla V100-SXM2-16GB (UUID: GPU-26e22503-5fd5-8f55-d068-e1714fbb6fd6)\n"
+            "GPU 3: Tesla V100-SXM2-16GB (UUID: GPU-ebfc20c7-5f1a-1bc9-0d98-601cbe21fc2d)\n"
+        )
+
+        stdout = nvidia_smi_out
+        result = ProcResult(stdout)
+        with patch("subprocess.run", return_value=result):
+            gpu_count = self.scheduler._get_gpu_count()
+            self.assertEqual(4, gpu_count)
+
+    def test_get_gpu_count_error(self) -> None:
+        with patch("subprocess.run", side_effect=Exception("test error")):
+            gpu_count = self.scheduler._get_gpu_count()
+            self.assertEqual(0, gpu_count)
+
+    def test_set_cuda_devices(self) -> None:
+        role = Role(
+            name="sleep",
+            image=self.test_dir,
+            entrypoint="sleep.sh",
+            args=["60"],
+            num_replicas=4,
+            resource=Resource(gpu=4, cpu=0, memMB=0),
+        )
+        self.scheduler._set_cuda_devices(role, 0, 0)
+        self.assertEqual("0,1,2,3", role.env["CUDA_VISIBLE_DEVICES"])
+        self.scheduler._set_cuda_devices(role, 1, 0)
+        self.assertEqual("4,5,6,7", role.env["CUDA_VISIBLE_DEVICES"])
+        self.scheduler._set_cuda_devices(role, 1, 10)
+        self.assertEqual("14,15,16,17", role.env["CUDA_VISIBLE_DEVICES"])
+
+    def test_get_cuda_devices_is_set(self) -> None:
+        with patch.object(self.scheduler, "_get_gpu_count", return_value=16):
+            appdef = AppDef(
+                name="role1",
+                roles=[
+                    Role(
+                        name="role1",
+                        image=self.test_dir,
+                        entrypoint="train",
+                        resource=Resource(gpu=2, cpu=0, memMB=0),
+                        num_replicas=2,
+                    ),
+                    Role(
+                        name="role2",
+                        image=self.test_dir,
+                        entrypoint="train",
+                        resource=Resource(gpu=3, cpu=0, memMB=0),
+                        num_replicas=2,
+                    ),
+                ],
+            )
+
+            popen_req = self.scheduler._to_popen_request(
+                appdef, {"auto_set_cuda_devices": True}
+            )
+            role1_params = popen_req.role_params["role1"]
+            self.assertEqual(2, len(role1_params))
+            self.assertEqual("0,1", role1_params[0].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("2,3", role1_params[1].env["CUDA_VISIBLE_DEVICES"])
+            role2_params = popen_req.role_params["role2"]
+            self.assertEqual(2, len(role2_params))
+            self.assertEqual("4,5,6", role2_params[0].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("7,8,9", role2_params[1].env["CUDA_VISIBLE_DEVICES"])
+
+    def test_get_cuda_devices_not_set(self) -> None:
+        with patch.object(self.scheduler, "_get_gpu_count", return_value=8):
+            trainer1 = AppDef(
+                name="trainer1",
+                roles=[
+                    Role(
+                        name="trainer1",
+                        image=self.test_dir,
+                        entrypoint="trainer1.sh",
+                        resource=Resource(gpu=4, cpu=0, memMB=0),
+                        num_replicas=4,
+                    )
+                ],
+            )
+
+            popen_req = self.scheduler._to_popen_request(trainer1, {})
+            role_params = popen_req.role_params["trainer1"]
+            self.assertEqual(4, len(role_params))
+            self.assertFalse("CUDA_VISIBLE_DEVICES" in role_params[0].env)
+            self.assertFalse("CUDA_VISIBLE_DEVICES" in role_params[1].env)
+            self.assertFalse("CUDA_VISIBLE_DEVICES" in role_params[2].env)
+            self.assertFalse("CUDA_VISIBLE_DEVICES" in role_params[3].env)
+
     def test_no_orphan_process_function(self) -> None:
         self._test_orphan_workflow()
 
@@ -839,6 +936,9 @@ def _test_orphan_workflow(self) -> None:
             target=start_sleep_processes, args=(self.test_dir, mp_queue, child_nproc)
         )
        proc.start()
+        # Before querying the queue we need to wait;
+        # otherwise we get `FileNotFoundError: [Errno 2] No such file or directory`
+        time.sleep(10)
         total_processes = child_nproc + 1
         pids = []
         for _ in range(total_processes):