Commit a1a268a

ruisizhang123 authored and pytorchmergebot committed
[dtensor] fix simplefsdp mixed-precision training bugs (#154975)
This is a follow-up to the previous DTensor redistribute PR (#150740), which enabled SimpleFSDP's mixed-precision training. In the most recent TorchTitan integration (pytorch/torchtitan#1250), we found discrepancies between SimpleFSDP's `fully_shard` and `replicate` modes when mixed-precision training is enabled.

After debugging, I found the problem is in DTensor redistribute: `local_tensor` is taken out of the original `input` a second time, so the tensor used for communication keeps its original precision instead of being cast to `forward_dtype`. This PR fixes the issue and corrects the previously added test cases.

After the fix, the loss curves of `fully_shard` and `replicate` mode match perfectly.

![loss](https://github.com/user-attachments/assets/a8faddae-a476-48c0-a411-3fe04d2233bd)

Pull Request resolved: #154975
Approved by: https://github.com/tianyu-l
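As a hedged illustration of the behavior this fix restores (not code from the commit): assuming the `forward_dtype`/`backward_dtype` keyword arguments that #150740 added to `DTensor.redistribute`, the local tensor feeding the all-gather should already carry `forward_dtype` when it reaches the collective.

```python
# Minimal sketch; assumes the forward_dtype kwarg on DTensor.redistribute
# from #150740, and a process group already initialized (e.g. via torchrun).
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Replicate, Shard

def shard_to_replicate_bf16() -> DTensor:
    mesh = init_device_mesh("cpu", (dist.get_world_size(),))
    local = torch.randn(4, 8, dtype=torch.float32)  # original precision
    dt = DTensor.from_local(local, mesh, [Shard(0)])
    # The cast to forward_dtype must happen *before* the all-gather; the bug
    # re-read the uncast local tensor, so communication ran in fp32.
    out = dt.redistribute(mesh, [Replicate()], forward_dtype=torch.bfloat16)
    assert out.to_local().dtype == torch.bfloat16  # holds after this fix
    return out
```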
1 parent 2608927 · commit a1a268a

2 files changed: +3 −2 lines changed

test/distributed/tensor/test_redistribute.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -312,7 +312,9 @@ def test_shard_to_replicate_forward_backward_datatype_conversion(self):
             backward_dtype=backward_dtype,
         )
         self.assertEqual(reshard_dtensor.size(), torch.Size(input_size))
-        self.assertEqual(expected_tensor, reshard_dtensor.to_local())
+        self.assertEqual(
+            expected_tensor.to(forward_dtype), reshard_dtensor.to_local()
+        )
         self.assertEqual(
             comm_mode.get_comm_counts()[funcol.all_gather_into_tensor], 1
         )
```

torch/distributed/tensor/_redistribute.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -318,7 +318,6 @@ def forward(  # type: ignore[override]
             device_mesh, placements, tensor_meta=current_spec.tensor_meta
         )
 
-        local_tensor = input._local_tensor
         output = redistribute_local_tensor(
             local_tensor, current_spec, target_spec, async_op=async_op
         )
```
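To see why deleting that single line fixes mixed precision, here is a toy single-process sketch (hypothetical names, not the commit's code): re-reading the source tensor after the `forward_dtype` cast silently discards the conversion, so the collective would run in the original precision.

```python
import torch

# Toy stand-in for redistribute_local_tensor; hypothetical name and logic.
def fake_collective(t: torch.Tensor) -> torch.Tensor:
    return t.clone()

def redistribute_with_cast(src: torch.Tensor, forward_dtype: torch.dtype,
                           reintroduce_bug: bool = False) -> torch.Tensor:
    local_tensor = src.to(forward_dtype)  # cast intended for communication
    if reintroduce_bug:
        # The removed line: re-reading the source clobbers the cast.
        local_tensor = src
    return fake_collective(local_tensor)

x = torch.randn(4, dtype=torch.float32)
assert redistribute_with_cast(x, torch.bfloat16).dtype == torch.bfloat16
# With the bug, the original fp32 precision leaks through:
assert redistribute_with_cast(x, torch.bfloat16, reintroduce_bug=True).dtype == torch.float32
```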
