4 files changed, +59 −0 lines changed
Changed directories: ignite/distributed/comp_models, tests/ignite/distributed/comp_models
ignite/distributed/comp_models (Horovod model)

@@ -1,4 +1,5 @@
 import os
+import warnings
 from typing import Callable, Mapping, Optional, Tuple

 import torch
@@ -97,6 +98,8 @@ def get_node_rank(self) -> int:
     def device(self) -> torch.device:
         if torch.cuda.is_available():
             index = torch.cuda.current_device()
+            if index < self.get_local_rank():
+                warnings.warn("Current device index is less than current local rank.")
             return torch.device("cuda:{}".format(index))
         return torch.device("cpu")

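The guard added above fires when the process is bound to a CUDA device whose index is lower than its local rank, which usually means the worker never called torch.cuda.set_device for its own GPU. A minimal sketch of the same pattern outside Ignite, assuming the local rank is read from a LOCAL_RANK environment variable (the variable name and the check_device_binding helper are illustrative assumptions, not part of the library):

import os
import warnings

import torch


def check_device_binding(local_rank: int) -> torch.device:
    # Mirrors the guard added above: warn when the currently selected CUDA
    # device has a smaller index than this worker's local rank.
    if torch.cuda.is_available():
        index = torch.cuda.current_device()
        if index < local_rank:
            warnings.warn("Current device index is less than current local rank.")
        return torch.device("cuda:{}".format(index))
    return torch.device("cpu")


if __name__ == "__main__":
    # Hypothetical single-node setup: each worker binds itself to the GPU
    # matching its local rank first, otherwise current_device() stays at 0.
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    if torch.cuda.is_available() and local_rank < torch.cuda.device_count():
        torch.cuda.set_device(local_rank)  # avoids the warning above
    print(check_device_binding(local_rank))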
ignite/distributed/comp_models (native torch.distributed model)

@@ -221,6 +221,8 @@ def get_node_rank(self) -> int:
     def device(self) -> torch.device:
         if self.backend() == dist.Backend.NCCL:
             index = torch.cuda.current_device()
+            if index < self.get_local_rank():
+                warnings.warn("Current device index is less than current local rank.")
             return torch.device("cuda:{}".format(index))
         return torch.device("cpu")

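With the same guard in the native (torch.distributed) model, a typical NCCL run avoids the warning by pinning each process to the GPU that matches its local rank before asking Ignite for the device. A hedged sketch using the public idist helpers; it assumes an external launcher such as torchrun has already set RANK, LOCAL_RANK, WORLD_SIZE and the master address/port in the environment:

import torch

import ignite.distributed as idist

# Runs in each worker process spawned by the launcher.
idist.initialize("nccl")

# Bind this process to the GPU matching its local rank first; otherwise
# torch.cuda.current_device() may stay at 0 and device() emits the warning.
torch.cuda.set_device(idist.get_local_rank())

device = idist.device()  # e.g. cuda:1 on the worker with local rank 1
print(device)

idist.finalize()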
tests/ignite/distributed/comp_models (Horovod model tests)

@@ -195,3 +195,30 @@ def test__hvd_dist_model_spawn_cuda():
         nproc_per_node=num_workers_per_machine,
         use_gloo=True,
     )
+
+
+"""
+@pytest.mark.distributed
+@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
+@pytest.mark.timeout(10)  # Doesn't do anything
+def test__warning_if_deviceindex_less_than_localrank():
+    import os
+    import torch.distributed as dist
+    import ignite.distributed as idist
+
+    os.environ["RANK"] = "1"
+    os.environ["WORLD_SIZE"] = "1"
+    os.environ["MASTER_ADDR"] = "0.0.0.0"
+    os.environ["MASTER_PORT"] = "2222"
+
+    dist.init_process_group(backend="nccl", init_method="env://")
+
+    with pytest.warns(UserWarning, match="Current device index is less than current local rank."):
+        _HorovodDistModel.get_world_size()
+
+    dist.destroy_process_group()
+
+    del os.environ["RANK"]
+    del os.environ["WORLD_SIZE"]
+    del os.environ["MASTER_ADDR"]
+    del os.environ["MASTER_PORT"]
+"""
tests/ignite/distributed/comp_models (native torch.distributed model tests)

@@ -299,3 +299,30 @@ def test__native_dist_model_spawn_gloo():
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test__native_dist_model_spawn_nccl():
     _test__native_dist_model_spawn("nccl", num_workers_per_machine=torch.cuda.device_count(), device="cuda")
+
+
+"""
+@pytest.mark.distributed
+@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
+@pytest.mark.timeout(10)  # Doesn't do anything
+def test__warning_if_deviceindex_less_than_localrank():
+    import os
+    import torch.distributed as dist
+    import ignite.distributed as idist
+
+    os.environ["RANK"] = "1"
+    os.environ["WORLD_SIZE"] = "1"
+    os.environ["MASTER_ADDR"] = "0.0.0.0"
+    os.environ["MASTER_PORT"] = "2222"
+
+    dist.init_process_group(backend="nccl", init_method="env://")
+
+    with pytest.warns(UserWarning, match="Current device index is less than current local rank."):
+        idist.get_world_size()
+
+    dist.destroy_process_group()
+
+    del os.environ["RANK"]
+    del os.environ["WORLD_SIZE"]
+    del os.environ["MASTER_ADDR"]
+    del os.environ["MASTER_PORT"]
+"""