
Commit 46fa38d

dstaay-fb authored and facebook-github-bot committed

Overlap comms on backward pass

Summary: Resolves issues around CUDA streams / NCCL deadlocks with autograd. Basically, create separate streams per pipelined embedding arch.

Differential Revision: D58220332

1 parent 0cfae1f commit 46fa38d
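The gist of the change, as a minimal sketch: instead of one shared stream, each pipelined embedding module gets its own CUDA stream, and the embedding-side backward (which kicks off the gradient all-to-all) is issued on that stream so the communication can overlap with the dense optimizer work on the default stream. The helper name and tensor lists below are illustrative, not the TorchRec API:

```python
# Minimal sketch of the overlap pattern this commit adopts, with hypothetical
# names (backward_on_module_streams, emb_tensors, detached) -- not the actual
# TorchRec API. Each pipelined embedding module gets its own CUDA stream; its
# backward (which triggers the gradient all-to-all) is launched there so the
# communication can overlap with dense work on the default stream.
import torch
from typing import List

def backward_on_module_streams(
    emb_tensors: List[torch.Tensor],   # module outputs, still attached to the embedding graph
    detached: List[torch.Tensor],      # detached copies the dense model consumed; .grad is already filled
    streams: List[torch.cuda.Stream],  # one stream per pipelined embedding module
) -> None:
    default_stream = torch.cuda.current_stream()
    for stream, emb, det in zip(streams, emb_tensors, detached):
        with torch.cuda.stream(stream):
            stream.wait_stream(default_stream)        # do not start before the dense backward finished
            torch.autograd.backward(emb, det.grad)    # embedding-side backward + comms on this stream
```

With one stream per module, one module's gradient all-to-all no longer serializes behind another's, and `wait_stream` keeps ordering safe relative to the dense backward that produced the gradients.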

File tree: 2 files changed (+103, -43 lines)


torchrec/distributed/train_pipeline/train_pipelines.py (72 additions, 38 deletions)

@@ -554,15 +554,8 @@ def __init__(
         self._stash_gradients = stash_gradients
 
         # use two data streams to support two concurrent batches
-        self._embedding_odd_stream: Optional[torch.cuda.streams.Stream] = (
-            (torch.cuda.Stream(priority=0)) if device.type == "cuda" else None
-        )
-        self._embedding_even_stream: Optional[torch.cuda.streams.Stream] = (
-            (torch.cuda.Stream(priority=0)) if device.type == "cuda" else None
-        )
-        self._overarch_stream: Optional[torch.cuda.streams.Stream] = (
-            (torch.cuda.Stream(priority=-1)) if device.type == "cuda" else None
-        )
+        self._embedding_odd_streams: List[Optional[torch.cuda.streams.Stream]] = []
+        self._embedding_even_streams: List[Optional[torch.cuda.streams.Stream]] = []
         self._gradients: Dict[str, torch.Tensor] = {}
 
     def _grad_swap(self) -> None:

@@ -572,6 +565,25 @@ def _grad_swap(self) -> None:
             self._gradients[name] = param.grad.clone()
             param.grad = grad
 
+    def _init_embedding_streams(self) -> None:
+
+        for _ in self._pipelined_modules:
+            self._embedding_odd_streams.append(
+                torch.cuda.Stream(priority=0) if self._device.type == "cuda" else None
+            )
+            self._embedding_even_streams.append(
+                torch.cuda.Stream(priority=0) if self._device.type == "cuda" else None
+            )
+
+    def _validate_optimizer(self) -> None:
+        for pipelined_module in self._pipelined_modules:
+            pipelined_params = set(pipelined_module.parameters())
+            for group in self._optimizer.param_groups:
+                if not set(group["params"]).isdisjoint(pipelined_params):
+                    logger.warning(
+                        f"SemiSync pipelined {type(pipelined_module)} and MLP optimizer share parameters"
+                    )
+
     def fill_pipeline(self, dataloader_iter: Iterator[In]) -> None:
         # pipeline is already filled
         if len(self.batches) >= 3:

@@ -591,7 +603,9 @@ def fill_pipeline(self, dataloader_iter: Iterator[In]) -> None:
                 # pyre-ignore [6]
                 EmbeddingPipelinedForward,
             )
+            self._init_embedding_streams()
             self.wait_sparse_data_dist(self.contexts[0])
+            self._validate_optimizer()
             # pyre-ignore [6]
             self.start_embedding_lookup(self.batches[0], self.contexts[0])

@@ -645,25 +659,25 @@ def progress(self, dataloader_iter: Iterator[In]) -> Out:
             self.wait_sparse_data_dist(self.contexts[2])
 
         if self._model.training:
-            # backward would put an implicit sync point in stream called from, ideally
-            # this would different from optimizer so it could start earilier, but currently not safe to do so.
-            with torch.cuda.stream(self._overarch_stream):
-                with record_function(f"## backward {self.contexts[0].index} ##"):
-                    torch.sum(losses, dim=0).backward()
-
-                with record_function(
-                    f"## optimizer {cast(int, self.contexts[0].index) - 1} ##"
-                ):
-                    if self.is_semi_sync() and self._stash_gradients:
-                        self._grad_swap()
-                    self._mlp_optimizer_step()
+            with record_function(f"## backward {self.contexts[0].index} ##"):
+                torch.sum(losses, dim=0).backward()
+                # pyre-ignore [6]
+                self.embedding_backward(self.contexts[0])
 
-                with record_function(
-                    f"## zero_grad {cast(int, self.contexts[0].index) - 1} ##"
-                ):
-                    self._optimizer.zero_grad()
+            with record_function(
+                f"## optimizer {cast(int, self.contexts[0].index) - 1} ##"
+            ):
+                if self.is_semi_sync() and self._stash_gradients:
+                    self._grad_swap()
+                self._mlp_optimizer_step()
+
+            with record_function(
+                f"## zero_grad {cast(int, self.contexts[0].index) - 1} ##"
+            ):
+                self._optimizer.zero_grad()
 
         if len(self.batches) >= 2 and not self.is_semi_sync():
+            torch.cuda.synchronize()  # needed to avoid race condition
             # pyre-ignore [6]
             self.start_embedding_lookup(self.batches[1], self.contexts[1])

@@ -674,10 +688,26 @@ def _mlp_forward(
         self, batch: In, context: TrainPipelineContext
     ) -> Tuple[torch.Tensor, Out]:
         with record_function(f"## forward {context.index} ##"):
-            with torch.cuda.stream(self._overarch_stream):
-                _wait_for_event(batch, context.event)
-                context.event = None
-                return cast(Tuple[torch.Tensor, Out], self._model(batch))
+            _wait_for_event(batch, context.event)
+            context.event = None
+            return cast(Tuple[torch.Tensor, Out], self._model(batch))
+
+    def embedding_backward(self, context: EmbeddingTrainPipelineContext) -> None:
+        default_stream = torch.cuda.current_stream()
+        streams = (
+            self._embedding_even_streams
+            if cast(int, context.index) % 2 == 0
+            else self._embedding_odd_streams
+        )
+        for stream, emb_tensor, grad_tensor in zip(
+            streams,
+            context.embedding_tensors,
+            context.detached_embedding_tensors,
+        ):
+            with torch.cuda.stream(stream):
+                # pyre-ignore
+                stream.wait_stream(default_stream)
+                torch.autograd.backward(emb_tensor, grad_tensor.grad)
 
     def copy_batch_to_gpu(
         self,

@@ -722,15 +752,19 @@ def start_embedding_lookup(
         if batch is None:
             return
         with record_function(f"## start_embedding_lookup {context.index} ##"):
-            with torch.cuda.stream(
-                self._embedding_even_stream
-                if cast(int, context.index) % 2 == 0
-                else self._embedding_odd_stream
-            ):
-                _wait_for_event(batch, context.event)
-                _start_embedding_lookup(self._pipelined_modules, batch, context)
-                context.event = torch.cuda.Event()
-                context.event.record()
+            _wait_for_event(batch, context.event)
+            context.event = []
+            for i, module in enumerate(self._pipelined_modules):
+                with torch.cuda.stream(
+                    self._embedding_even_streams[i]
+                    if cast(int, context.index) % 2 == 0
+                    else self._embedding_odd_streams[i]
+                ):
+                    _start_embedding_lookup([module], batch, context)
+                    event = torch.cuda.Event()
+                    event.record()
+                    # pyre-ignore [16]
+                    context.event.append(event)
 
 
 class PrefetchTrainPipelineSparseDist(TrainPipelineSparseDist[In, Out]):
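For context on the lookup side of this diff, here is a small, self-contained sketch (hypothetical helper names, assumes a CUDA device) of the per-module stream pools and per-module events that the reworked `start_embedding_lookup` relies on:

```python
# Self-contained sketch (hypothetical helper names; assumes a CUDA device) of
# per-module stream pools and per-module events: lookups for different
# pipelined modules run on different streams, and one event per module is
# recorded so a later stage can wait on all of them.
import torch
from typing import Callable, List, Optional

def make_stream_pool(num_modules: int, device: torch.device) -> List[Optional[torch.cuda.Stream]]:
    # One stream per pipelined embedding module; None when not running on CUDA.
    return [
        torch.cuda.Stream(priority=0) if device.type == "cuda" else None
        for _ in range(num_modules)
    ]

def launch_lookups(
    streams: List[Optional[torch.cuda.Stream]],
    lookups: List[Callable[[], None]],
) -> List[torch.cuda.Event]:
    # Kick off each module's lookup on its own stream and record a per-module event.
    events: List[torch.cuda.Event] = []
    for stream, lookup in zip(streams, lookups):
        with torch.cuda.stream(stream):
            lookup()
            event = torch.cuda.Event()
            event.record()
            events.append(event)
    return events

# Even and odd batches use distinct pools so two in-flight batches never share streams:
# even_streams = make_stream_pool(num_modules, device)
# odd_streams = make_stream_pool(num_modules, device)
# streams = even_streams if batch_index % 2 == 0 else odd_streams
```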

torchrec/distributed/train_pipeline/utils.py (31 additions, 5 deletions)

@@ -96,7 +96,7 @@ class TrainPipelineContext:
     fused_splits_awaitables: List[Tuple[List[str], FusedKJTListSplitsAwaitable]] = (
         field(default_factory=list)
    )
-    event: Optional[torch.cuda.Event] = None
+    event: Optional[Union[List[torch.cuda.Event], torch.cuda.Event]] = None
     index: Optional[int] = None
     version: int = (
         0  # 1 is current version, 0 is deprecated but supported for backward compatibility

@@ -114,6 +114,8 @@ class PrefetchTrainPipelineContext(TrainPipelineContext):
 @dataclass
 class EmbeddingTrainPipelineContext(TrainPipelineContext):
     embedding_a2a_requests: Dict[str, Multistreamable] = field(default_factory=dict)
+    embedding_tensors: List[torch.Tensor] = field(default_factory=list)
+    detached_embedding_tensors: List[torch.Tensor] = field(default_factory=list)
 
 
 @dataclass

@@ -230,8 +232,25 @@ def __call__(self, *input, **kwargs) -> Awaitable:
         if self._stream is not None:
             torch.cuda.current_stream().wait_stream(self._stream)
             cur_stream = torch.cuda.current_stream()
-            ctx.record_stream(cur_stream)
-        return self._context.embedding_a2a_requests.pop(self._name)
+        awaitable = self._context.embedding_a2a_requests.pop(self._name)
+        embs = awaitable.wait()
+        if isinstance(embs, Dict):
+            for jt in embs.values():
+                tensor = jt.values()
+                new_tensor = tensor.detach().requires_grad_()
+                jt._values = new_tensor
+                # pyre-ignore [16]
+                self._context.embedding_tensors.append(tensor)
+                # pyre-ignore [16]
+                self._context.detached_embedding_tensors.append(new_tensor)
+        else:
+            tensor = embs.values()
+            new_tensor = tensor.detach().requires_grad_()
+            embs._values = new_tensor
+            self._context.embedding_tensors.append(tensor)
+            self._context.detached_embedding_tensors.append(new_tensor)
+
+        return embs
 
 
 class PrefetchPipelinedForward(BaseForward):

@@ -373,11 +392,18 @@ def _wait_for_batch(batch: In, stream: Optional[torch.cuda.streams.Stream]) -> None:
     batch.record_stream(cur_stream)
 
 
-def _wait_for_event(batch: In, event: Optional[torch.cuda.Event]) -> None:
+def _wait_for_event(
+    batch: In, event: Optional[Union[List[torch.cuda.Event], torch.cuda.Event]]
+) -> None:
     """
     Wait for event
     """
-    if event is not None:
+
+    if event and isinstance(event, list):
+        for sub_event in event:
+            sub_event.wait()
+    elif event:
+        # pyre-ignore
         event.wait()
     cur_stream = torch.cuda.current_stream()
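The `__call__` change above splits the autograd graph at the embedding boundary: the dense model consumes a detached tensor that requires grad, so the dense backward stops there and leaves the upstream gradient in `.grad`, which `embedding_backward` later feeds into the embedding-side graph. A toy, CPU-runnable illustration of that hand-off (tensor names are mine, not TorchRec's):

```python
# Illustrative sketch of the detach hand-off (toy tensors, illustrative names;
# not TorchRec code). The dense model sees a detached view that requires grad;
# the dense backward stops at that boundary and leaves the upstream gradient in
# detached.grad, which is later fed into the embedding-side backward.
import torch

emb_weight = torch.randn(4, 8, requires_grad=True)   # stands in for embedding parameters
emb_out = emb_weight * 2.0                            # embedding-side graph

detached = emb_out.detach().requires_grad_()          # what the dense model consumes
loss = detached.sum()                                 # dense-side graph
loss.backward()                                       # fills detached.grad, does not touch emb_weight

torch.autograd.backward(emb_out, detached.grad)       # later: embedding-side backward
assert emb_weight.grad is not None                    # gradients flowed across the split
```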
