
Commit eaac5ea

warning if current device index is lower than current local rank

1 parent d384dc6

4 files changed: +50 −0 lines

ignite/distributed/comp_models/horovod.py

Lines changed: 3 additions & 0 deletions

@@ -1,4 +1,5 @@
 import os
+import warnings
 from typing import Callable, Mapping, Optional, Tuple
 
 import torch
@@ -97,6 +98,8 @@ def get_node_rank(self) -> int:
     def device(self) -> torch.device:
         if torch.cuda.is_available():
             index = torch.cuda.current_device()
+            if index < self.get_local_rank():
+                warnings.warn("Current device index is less than current local rank.")
             return torch.device("cuda:{}".format(index))
         return torch.device("cpu")
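
The warning fires when the GPU a worker is currently bound to does not match its local rank, which typically means torch.cuda.set_device was never called. A minimal sketch of the usual setup that avoids the warning, assuming a standard Horovod program (illustrative only, not part of this commit):

import horovod.torch as hvd
import torch

hvd.init()
# Without this call, torch.cuda.current_device() stays at 0 in every
# process, so a worker with hvd.local_rank() == 1 would trigger the new
# warning and silently share GPU 0 with worker 0.
torch.cuda.set_device(hvd.local_rank())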

ignite/distributed/comp_models/native.py

Lines changed: 2 additions & 0 deletions

@@ -221,6 +221,8 @@ def get_node_rank(self) -> int:
     def device(self) -> torch.device:
         if self.backend() == dist.Backend.NCCL:
             index = torch.cuda.current_device()
+            if index < self.get_local_rank():
+                warnings.warn("Current device index is less than current local rank.")
             return torch.device("cuda:{}".format(index))
         return torch.device("cpu")
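
The native backend has the same failure mode. A hedged sketch of the conventional remedy, assuming an env:// launch where the launcher (e.g. torchrun) exports LOCAL_RANK (the variable name follows the torchrun convention and is not something this commit adds):

import os
import torch
import torch.distributed as dist

# Pin the process to its own GPU so torch.cuda.current_device() matches
# the local rank; device() then returns cuda:<local_rank> and the new
# warning stays silent.
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
dist.init_process_group(backend="nccl", init_method="env://")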

tests/ignite/distributed/comp_models/test_horovod.py

Lines changed: 22 additions & 0 deletions

@@ -195,3 +195,25 @@ def test__hvd_dist_model_spawn_cuda():
         nproc_per_node=num_workers_per_machine,
         use_gloo=True,
     )
+
+
+@pytest.mark.distributed
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Skip if less than 2 GPUs")
+def test__warning_if_deviceindex_less_than_localrank(world_size):
+    import os
+    import torch.distributed as dist
+
+    os.environ["RANK"] = "1"
+    os.environ["MASTER_ADDR"] = "0.0.0.0"
+    os.environ["MASTER_PORT"] = "2222"
+
+    dist.init_process_group(backend="nccl", init_method="env://")
+
+    with pytest.warns(UserWarning, match=r"Current device index is less than current local rank."):
+        _HorovodDistModel.get_world_size()
+
+    dist.destroy_process_group()
+
+    del os.environ["RANK"]
+    del os.environ["MASTER_ADDR"]
+    del os.environ["MASTER_PORT"]

tests/ignite/distributed/comp_models/test_native.py

Lines changed: 23 additions & 0 deletions

@@ -299,3 +299,26 @@ def test__native_dist_model_spawn_gloo():
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test__native_dist_model_spawn_nccl():
     _test__native_dist_model_spawn("nccl", num_workers_per_machine=torch.cuda.device_count(), device="cuda")
+
+
+@pytest.mark.distributed
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Skip if less than 2 GPUs")
+def test__warning_if_deviceindex_less_than_localrank(world_size):
+    import os
+    import torch.distributed as dist
+    import ignite.distributed as idist
+
+    os.environ["RANK"] = "1"
+    os.environ["MASTER_ADDR"] = "0.0.0.0"
+    os.environ["MASTER_PORT"] = "2222"
+
+    dist.init_process_group(backend="nccl", init_method="env://")
+
+    with pytest.warns(UserWarning, match=r"Current device index is less than current local rank."):
+        idist.get_world_size()
+
+    dist.destroy_process_group()
+
+    del os.environ["RANK"]
+    del os.environ["MASTER_ADDR"]
+    del os.environ["MASTER_PORT"]
