Add support for Sync BN

mannatsingh · facebook-github-bot · commit bd9888bb303d · 2020-03-09T10:09:33.000-07:00
Summary:
Added support for using sync batch normalization using PyTorch's implementation or Apex's.

Plugged in the model complexity hook to `classy_train.py`. It helps test the bug I encountered and fixed which needs the profiler + sync batch norm.

Differential Revision: D20307435

fbshipit-source-id: cf93dd50fff06c1d809f97c4267d4af9934564bb
diff --git a/classy_vision/tasks/classification_task.py b/classy_vision/tasks/classification_task.py
@@ -134,6 +134,7 @@ def __init__(self):
         self.amp_opt_level = None
         self.perf_log = []
         self.last_batch = None
+        self.sync_batch_norm = False
 
     def set_checkpoint(self, checkpoint):
         """Sets checkpoint on task.
@@ -282,6 +283,29 @@ def set_amp_opt_level(self, opt_level: Optional[str]):
             logging.info(f"AMP enabled with opt_level {opt_level}")
         return self
 
+    def set_sync_batch_norm(self, sync_batch_norm: bool) -> "ClassificationTask":
+        """Enable / disable sync batch norm.
+
+        Args:
+            sync_batch_norm: Set to True to enable and False otherwise.
+        Raises:
+            RuntimeError: If sync_batch_norm is True and apex is not installed.
+
+        Warning: apex needs to be installed to utilize this feature.
+        """
+        self.sync_batch_norm = sync_batch_norm
+        if sync_batch_norm:
+            """
+            if not apex_available:
+                raise RuntimeError(
+                    "apex is not installed, cannot enable sync_batch_norm"
+                )
+            """
+            logging.info("Using Synchronized Batch Normalization")
+        else:
+            logging.info("Synchronized Batch Normalization is disabled")
+        return self
+
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "ClassificationTask":
         """Instantiates a ClassificationTask from a configuration.
@@ -303,6 +327,7 @@ def from_config(cls, config: Dict[str, Any]) -> "ClassificationTask":
         loss = build_loss(config["loss"])
         test_only = config.get("test_only", False)
         amp_opt_level = config.get("amp_opt_level")
+        sync_batch_norm = config.get("sync_batch_norm", False)
         meters = build_meters(config.get("meters", {}))
         model = build_model(config["model"])
         # put model in eval mode in case any hooks modify model states, it'll
@@ -320,6 +345,7 @@ def from_config(cls, config: Dict[str, Any]) -> "ClassificationTask":
             .set_optimizer(optimizer)
             .set_meters(meters)
             .set_amp_opt_level(amp_opt_level)
+            .set_sync_batch_norm(sync_batch_norm)
             .set_distributed_options(
                 BroadcastBuffersMode[config.get("broadcast_buffers", "DISABLED")]
             )
@@ -498,6 +524,10 @@ def prepare(
             multiprocessing_context=dataloader_mp_context,
         )
 
+        if self.sync_batch_norm:
+            # self.base_model = apex.parallel.convert_syncbn_model(self.base_model)
+            self.base_model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self.base_model)
+
         # move the model and loss to the right device
         if use_gpu:
             self.base_model, self.loss = copy_model_to_gpu(self.base_model, self.loss)
@@ -585,6 +615,8 @@ def get_classy_state(self, deep_copy: bool = False):
         Args:
             deep_copy: If true, does a deep copy of state before returning.
         """
+        from classy_vision.generic.distributed_util import get_world_size 
+        print("World size", get_world_size())
         classy_state_dict = {
             "train": self.train,
             "base_model": self.base_model.get_classy_state(),
diff --git a/test/generic/config_utils.py b/test/generic/config_utils.py
@@ -228,6 +228,7 @@ def get_test_mlp_task_config():
             "input_dim": 1200,
             "output_dim": 1000,
             "hidden_dims": [10],
+            "use_batchnorm": True,  # used for testing sync batchnorm
         },
         "meters": {"accuracy": {"topk": [1]}},
         "optimizer": {
diff --git a/test/trainer_distributed_trainer_test.py b/test/trainer_distributed_trainer_test.py
@@ -22,10 +22,13 @@ def setUp(self):
         config = get_test_mlp_task_config()
         invalid_config = copy.deepcopy(config)
         invalid_config["name"] = "invalid_task"
+        sync_bn_config = copy.deepcopy(config)
+        sync_bn_config["sync_batch_norm"] = True
         self.config_files = {}
         for config_key, config in [
             ("config", config),
             ("invalid_config", invalid_config),
+            ("sync_bn_config", sync_bn_config),
         ]:
             with tempfile.NamedTemporaryFile(mode="w", delete=False) as f:
                 json.dump(config, f)
@@ -37,7 +40,7 @@ def tearDown(self):
         for config_file in self.config_files.values():
             os.unlink(config_file)
 
-    def test_training(self):
+    def _test_training(self):
         """Checks we can train a small MLP model."""
 
         num_processes = 2
@@ -63,3 +66,25 @@ def test_training(self):
             result = subprocess.run(cmd, shell=True)
             success = result.returncode == 0
             self.assertEqual(success, expected_success)
+
+    def test_sync_batch_norm(self):
+        """Test that sync batch norm training doesn't hang."""
+
+        num_processes = 2
+        device = "gpu" if torch.cuda.is_available() else "cpu"
+
+        cmd = f"""{sys.executable} -m torch.distributed.launch \
+        --nnodes=1 \
+        --nproc_per_node={num_processes} \
+        --master_addr=localhost \
+        --master_port=29500 \
+        --use_env \
+        {self.path}/../classy_train.py \
+        --device={device} \
+        --config={self.config_files["sync_bn_config"]} \
+        --num_workers=4 \
+        --log_freq=100 \
+        --distributed_backend=ddp
+        """
+        result = subprocess.run(cmd, shell=True)
+        self.assertEqual(result.returncode, 0)