Skip to content

Commit ab3d1e9

Browse files
TroyGarden authored and facebook-github-bot committed
switch to new op for KeyedTensor.regroup (#2226)
Summary: Pull Request resolved: #2226 # context * the new op `permute_multi_embedding` outperforms the original op `_fbgemm_permute_pooled_embs` * this diff makes the move to switch to the new op # benchmark * more results: D58907223, [traces](https://drive.google.com/drive/folders/1DEYozPihmij2zRAyG9AMxaIbcjTPWRVU?usp=drive_link) * previous prod {F1755206204} * new prod {F1755207013} * metrics |Operator|CPU runtime|GPU runtime|GPU memory|notes| |---|---|---|---|---| |**[fallback] pytorch generic**|3.9 ms|3.2 ms|1.0 K|CPU-bounded, allow duplicates| |**[previous prod] permute_pooled_embs**|1.9 ms|4.9 ms|1.5 K|GPU-bounded, does **NOT** allow duplicates, PT2 non-compatible `pin_and_move`| |**[new prod] permute_multi_embedding**|1.0 ms|2.0 ms|1.0 K|both CPU and GPU runtime/memory improved, **ALLOW** duplicates, PT2 friendly| NOTE: the new op takes in `List[List[str]]` and `List[List[int]]`, it currently does not support dynamic_shape and produces error like the following: > 1) SerializeError: Failed serializing node kt_regroup_permutes in graph: %kt_regroup_permutes : [num_users=3] = call_function[target=torch.ops.fbgemm.kt_regroup_permutes.default](args = (%ir_custom_op, [[f1], [f2]], [[3], [5]], [[f1], [f2]]), kwargs = {}) ... Caused by SerializeError: Unsupported list/tuple argument type: [<class 'torch.fx.immutable_collections.immutable_list'>, <class 'torch.fx.immutable_collections.immutable_list'>] Reviewed By: dstaay-fb Differential Revision: D55277833 fbshipit-source-id: be47179c62b2df48445c78eabf5d7d44582a495b
1 parent 0221dfc commit ab3d1e9

File tree

2 files changed

+54
-5
lines changed

2 files changed

+54
-5
lines changed

torchrec/ir/tests/test_serializer.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,59 @@ def test_deserialized_device(self) -> None:
352352
continue
353353
assert param.device.type == device.type, f"{name} should be on {device}"
354354

# pyre-ignore
@unittest.skipIf(
    torch.cuda.device_count() <= 0,
    "this test needs a GPU machine to run",
)
def test_deserialize_device_kt_regroup(self) -> None:
    """Round-trip a model using KeyedTensor.regroup through IR
    serialization + torch.export, deserialize it onto CUDA, and verify
    its outputs match the eager-mode outputs."""

    class Model(nn.Module):
        # Thin wrapper: run the EBC and regroup its pooled output into
        # one tensor per key.
        def __init__(self, ebc):
            super().__init__()
            self.ebc = ebc

        def forward(
            self,
            features: KeyedJaggedTensor,
        ) -> List[torch.Tensor]:
            pooled = self.ebc(features)
            singleton_groups = [[key] for key in pooled.keys()]
            return KeyedTensor.regroup([pooled], singleton_groups)

    wrapped = Model(self.generate_model().ebc1)
    features = KeyedJaggedTensor.from_offsets_sync(
        keys=["f1", "f2", "f3"],
        values=torch.tensor([0, 1, 2, 3, 2, 3]),
        offsets=torch.tensor([0, 2, 2, 3, 4, 5, 6]),
    )
    eager_out = wrapped(features)

    # Serialize EBC
    wrapped, sparse_fqns = encapsulate_ir_modules(wrapped, JsonSerializer)
    exported = torch.export.export(
        wrapped,
        (features,),
        {},
        strict=False,
        # Allows KJT to not be unflattened and run a forward on unflattened EP
        preserve_module_call_signature=(tuple(sparse_fqns)),
    )
    device = torch.device("cuda")
    deserialized_model = decapsulate_ir_modules(
        torch.export.unflatten(exported), JsonSerializer, device
    )
    deserialized_model.to(device)
    features = features.to(device)

    deserialized_model.load_state_dict(wrapped.state_dict())
    # Run forward on deserialized model
    deserialized_out = deserialized_model(features)

    for expected, actual in zip(eager_out, deserialized_out):
        assert expected.shape == actual.shape
        assert torch.allclose(expected.to(actual), actual)
355408
def test_compound_module(self) -> None:
356409
tb1_config = EmbeddingBagConfig(
357410
name="t1",

torchrec/sparse/jagged_tensor.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2753,11 +2753,7 @@ def to_dict(self) -> Dict[str, torch.Tensor]:
def regroup(
    keyed_tensors: List["KeyedTensor"], groups: List[List[str]]
) -> List[torch.Tensor]:
    """Regroup the pooled values of *keyed_tensors* into one dense tensor
    per group of keys in *groups*.

    Delegates to the fbgemm `permute_multi_embedding` op, which (per the
    switch-over in this commit) outperforms `_fbgemm_permute_pooled_embs`
    and supports duplicate keys across groups.
    """
    regrouped = permute_multi_embedding(keyed_tensors, groups)
    return regrouped
27612757

27622758
@staticmethod
27632759
def regroup_as_dict(

0 commit comments

Comments
 (0)