
Commit 3e82808

jamesjwu authored and thatgeeman committed
Fix dynamo tracing into AOTAutogradCache results in cpu tensors (pytorch#155251)
On this line, we see that the bw_compiler that dynamo uses for AOTAutograd automatically disables the backward runnable: https://github.com/pytorch/pytorch/blob/05dd638ee98b36254c84095894c36fd0e7d95544/torch/_dynamo/backends/common.py#L76

This keeps dynamo out of the bw_compiler itself, but it also disables the compiled runnable the compiler returns. On an AOTAutogradCache hit, however, we never call the bw_compiler, so that disable is never applied. This only has an effect in certain cases of CPU tensors' backwards, where the backward runs in Python land and dynamo unnecessarily tries to trace through the inductor-generated code. It also only matters if the backward is invoked outside of dynamo itself (say, in a graph break in eager mode), since dynamo already disables the forward function properly.

```
I0605 09:58:40.135000 3981970 torch/_dynamo/eval_frame.py:517] TorchDynamo attempted to trace the following frames: [
I0605 09:58:40.135000 3981970 torch/_dynamo/eval_frame.py:517] * fn /home/jjwu/test.py:9
I0605 09:58:40.135000 3981970 torch/_dynamo/eval_frame.py:517] * cast /data/users/jjwu/a/pytorch-env/lib/python3.10/typing.py:1737
I0605 09:58:40.135000 3981970 torch/_dynamo/eval_frame.py:517] * call /tmp/torchinductor_jjwu/rq/crq327nhoyjzog5n3qlchauucdrunrtutwmmoh7ipoe2ngnson5s.py:35
I0605 09:58:40.135000 3981970 torch/_dynamo/eval_frame.py:517] * fn /home/jjwu/test.py:9
I0605 09:58:40.135000 3981970 torch/_dynamo/eval_frame.py:517] * cast /data/users/jjwu/a/pytorch-env/lib/python3.10/typing.py:1737
I0605 09:58:40.135000 3981970 torch/_dynamo/eval_frame.py:517] * call /tmp/torchinductor_jjwu/rq/crq327nhoyjzog5n3qlchauucdrunrtutwmmoh7ipoe2ngnson5s.py:35
I0605 09:58:40.135000 3981970 torch/_dynamo/eval_frame.py:517] ]
```

This PR fixes the issue and adds a unit test showing that, with or without a cache hit, the frames dynamo traces are identical.

Fixes pytorch#154536

Pull Request resolved: pytorch#155251
Approved by: https://github.com/bdhirsh, https://github.com/anijain2305
1 parent c2b27ab commit 3e82808
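
To make the failure mode concrete, here is a minimal repro sketch. It is hypothetical (not the author's /home/jjwu/test.py from the log above), and the two config toggles simply mirror the flags patched in the new test below.

```python
import torch
import torch._dynamo
import torch._functorch.config as functorch_config
import torch._inductor.config as inductor_config

# Enable the caches involved in the bug (mirrors the test's config patches).
functorch_config.enable_autograd_cache = True
inductor_config.fx_graph_cache = True

@torch.compile
def fn(x):
    # Runs the backward during the forward, i.e. eagerly / outside dynamo,
    # which is the path where the unnecessary tracing showed up.
    (x_grad,) = torch.autograd.grad(x.sum(), x)
    return x_grad

a = torch.randn(10, 10, requires_grad=True, device="cpu")
fn(a)                  # cache miss: bw_compiler runs and wraps the backward in disable
torch._dynamo.reset()  # fresh dynamo state; the on-disk caches survive
fn(a)                  # cache hit: before this fix the loaded backward was not disabled,
                       # so dynamo tried to trace the inductor-generated `call` frame
```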

File tree

3 files changed: +60 −2 lines

test/dynamo/test_aot_autograd_cache.py
torch/_dynamo/backends/common.py
torch/_functorch/_aot_autograd/autograd_cache.py

test/dynamo/test_aot_autograd_cache.py

Lines changed: 39 additions & 0 deletions
```diff
@@ -1,5 +1,6 @@
 # Owner(s): ["module: dynamo"]
 
+import copy
 import os
 import shutil
 import unittest
@@ -822,6 +823,44 @@ def fn(a, b):
         self.assertEqual(a.grad, a2.grad)
         self.assertEqual(b.grad, b2.grad)
 
+    @inductor_config.patch("fx_graph_remote_cache", False)
+    @inductor_config.patch({"fx_graph_cache": True})
+    @functorch_config.patch({"enable_autograd_cache": True})
+    @functorch_config.patch({"strict_autograd_cache": True})
+    def test_autograd_no_dynamo_trace_backward(self):
+        """
+        Test that dynamo does not trace into the backward compiled function,
+        even on cache hit.
+        """
+        torch._dynamo.eval_frame.clear_dynamo_tls()
+
+        @torch.compile
+        def fn(x):
+            # Calls x.sum().backward() during forward execution of fn
+            (x_grad,) = torch.autograd.grad(x.sum(), x)
+            return x_grad
+
+        a = torch.randn(10, 10, requires_grad=True, device="cpu")
+        result = fn(a)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
+        # Backward of `sum` will run during execution of graph break
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
+        traced_frame_infos = copy.deepcopy(
+            torch._dynamo.eval_frame.dynamo_tls.traced_frame_infos
+        )
+
+        torch._dynamo.reset()
+        torch._dynamo.eval_frame.clear_dynamo_tls()
+        result2 = fn(a)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 1)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
+        new_traced_frame_infos = torch._dynamo.eval_frame.dynamo_tls.traced_frame_infos
+        self.assertEqual(result, result2)
+        # Dynamo should trace exactly the same frames on cache hit
+        self.assertEqual(traced_frame_infos, new_traced_frame_infos)
+
     @inductor_config.patch("fx_graph_remote_cache", False)
     @inductor_config.patch("fx_graph_cache", True)
     @functorch_config.patch({"enable_autograd_cache": True})
```

torch/_dynamo/backends/common.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -68,7 +68,10 @@ def __call__(self, gm: torch.fx.GraphModule, example_inputs, **kwargs):
 
 
 def wrap_bw_compiler(bw_compiler_fn):
     def _wrapped_bw_compiler(*args, **kwargs):
-        # stop TorchDynamo from trying to compile our generated backwards pass
+        # Note [Wrapping bw_compiler in disable]
+        # The two disables here:
+        # - stop TorchDynamo from trying to compile the bw_compiler function itself
+        # - stop TorchDynamo from trying to compile the generated backwards pass bw_compiler produces
         return disable(
             disable(
                 bw_compiler_fn, reason="do not trace backward compiler function"
```
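
As a reading aid, here is a self-contained sketch of what the two disables buy (simplified; not the actual torch/_dynamo/backends/common.py): the inner disable keeps dynamo from tracing the compiler function itself when the autograd engine invokes it lazily, and the outer disable keeps dynamo from tracing the compiled backward callable it returns.

```python
import torch
from torch._dynamo import disable

def wrap_bw_compiler_sketch(bw_compiler_fn):
    # Simplified stand-in for _wrapped_bw_compiler above.
    def _wrapped_bw_compiler(*args, **kwargs):
        # Inner disable: do not trace bw_compiler_fn while it compiles the
        # backward graph (this happens lazily, on the first backward call).
        compiled_bw = disable(
            bw_compiler_fn, reason="do not trace backward compiler function"
        )(*args, **kwargs)
        # Outer disable: do not trace the compiled backward either, e.g. the
        # inductor-generated Python code that runs CPU backwards.
        return disable(
            compiled_bw, reason="do not trace generated backwards pass"
        )

    return _wrapped_bw_compiler
```

On a cache hit the compiler never runs, so it is exactly the outer wrapper that goes missing; reapplying it is what the autograd_cache.py change below does.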

torch/_functorch/_aot_autograd/autograd_cache.py

Lines changed: 17 additions & 1 deletion
```diff
@@ -589,6 +589,15 @@ class CompiledBackward(GenericCompiledBackward[CompiledFxGraph], FxGraphCacheLoa
     def _is_backward(self) -> bool:
         return True
 
+    def post_compile(
+        self, result: CompiledFxGraph, fx_config: _CompileFxKwargs
+    ) -> CompiledFxGraph:
+        compiled_bw = super().post_compile(result, fx_config)
+        # See note [Wrapping bw_compiler in disable]
+        # This is done by _wrapped_bw_compiler in torch/_dynamo/backends/common.py
+        # But since on cache hit we do not call the bw_compiler, we need to reapply the disable
+        return torch._dynamo.disable(compiled_bw, reason="do not trace generated backwards pass")  # type: ignore[return-value]
+
 
 # Forward types don't have any extra parameters, so this is just a TypeAlias, in essence
 class BundledCompiledForward(CompiledFxGraphLoadable):
@@ -599,7 +608,14 @@ class BundledCompiledForward(CompiledFxGraphLoadable):
 class BundledCompiledBackward(
     GenericCompiledBackward[CompiledFxGraph], CompiledFxGraphLoadable
 ):
-    pass
+    def post_compile(
+        self, result: CompiledFxGraph, fx_config: _CompileFxKwargs
+    ) -> CompiledFxGraph:
+        compiled_bw = super().post_compile(result, fx_config)
+        # See note [Wrapping bw_compiler in disable]
+        # This is done by _wrapped_bw_compiler in torch/_dynamo/backends/common.py
+        # But since on cache hit we do not call the bw_compiler, we need to reapply the disable
+        return torch._dynamo.disable(compiled_bw, reason="do not trace generated backwards pass")  # type: ignore[return-value]
 
 
 TForward = TypeVar("TForward", bound=InductorOutput)
```
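
Finally, a small hedged sketch of what the reapplied wrapper does to the object loaded from cache (compiled_backward_stub is a made-up stand-in for the inductor-generated backward): the callable behaves identically, dynamo just skips its frame instead of attempting to trace it, which is what the log in the commit message showed it doing before the fix.

```python
import torch
import torch._dynamo

def compiled_backward_stub(grad_out):
    # Hypothetical stand-in for an inductor-generated backward `call`.
    return grad_out * 2

# What post_compile now does to the artifact it loads on a cache hit:
guarded_bw = torch._dynamo.disable(
    compiled_backward_stub, reason="do not trace generated backwards pass"
)

g = torch.ones(4)
assert torch.equal(guarded_bw(g), compiled_backward_stub(g))  # same numerics
# The only difference is that dynamo now skips this frame rather than
# tracing the generated code.
```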
