@@ -28,6 +28,7 @@
 
 import torch
 from torch.autograd.profiler import record_function
+from torchrec.distributed.comm_ops import set_use_sync_collectives
 from torchrec.distributed.dist_data import KJTAllToAllTensorsAwaitable
 from torchrec.distributed.model_parallel import ShardedModule
 from torchrec.distributed.train_pipeline.utils import (
@@ -208,17 +209,24 @@ def __init__(
         self._cur_batch: Optional[In] = None
 
     def progress(self, dataloader_iter: Iterator[In]) -> Out:
+        if self._iter == 0:
+            # Turn on sync collectives for the PT2 pipeline so that compiled
+            # and graph-break ranks follow the same logic.
+            set_use_sync_collectives(True)
+
         cc = self._compile_configs
 
         with record_function("## load_batch ##"):
             cur_batch = next(dataloader_iter)
 
-        if self._input_transformer:
-            cur_batch = self._input_transformer(cur_batch)
-
         with record_function("## copy_batch_to_gpu ##"):
             self._cur_batch = _to_device(cur_batch, self._device, non_blocking=False)
 
+        # The input transformer also attaches PT2 hints for the compiler; these must be
+        # applied to the exact object passed to the compiled model, so do not move this before _to_device.
+        if self._input_transformer:
+            self._cur_batch = self._input_transformer(self._cur_batch)
+
         if self._model.training:
             with record_function("## zero_grad ##"):
                 self._optimizer.zero_grad()
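The first added block flips a process-wide flag once, on the first `progress()` call, and leaves it on for the lifetime of the pipeline. Below is a minimal sketch of that toggle factored as a scoped helper; only `set_use_sync_collectives` comes from the import added in this diff, while the wrapper, its name, and the assumption that passing `False` restores asynchronous collectives are illustrative.

```python
from torchrec.distributed.comm_ops import set_use_sync_collectives


def run_step_with_sync_collectives(step_fn):
    """Force synchronous collectives while step_fn runs (illustrative sketch only)."""
    set_use_sync_collectives(True)
    try:
        return step_fn()
    finally:
        # Assumption: False restores the default (asynchronous) behaviour.
        set_use_sync_collectives(False)
```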
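The relocated `_input_transformer` call is where PT2 shape hints can be attached to the batch object that the compiled model will actually see, which is why it now runs after `_to_device`. A hedged sketch of such a transformer follows: `torch._dynamo.mark_dynamic` is a standard PT2 API, but the `float_features` attribute, the `mark_batch_dynamic` name, and the commented-out pipeline constructor are assumptions for illustration.

```python
import torch
import torch._dynamo


def mark_batch_dynamic(batch):
    # Hypothetical transformer: mark the batch dimension of the on-device
    # features as dynamic so torch.compile does not specialize on batch size.
    # It must run after _to_device, on the exact object handed to the
    # compiled model; otherwise the hints attach to the pre-copy object.
    torch._dynamo.mark_dynamic(batch.float_features, 0)  # `float_features` is an assumed attribute
    return batch


# Assumed usage with a PT2 train pipeline that accepts an input_transformer:
# pipeline = TrainPipelinePT2(
#     model, optimizer, device, input_transformer=mark_batch_dynamic
# )
```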