From a2c3f35f047e6b711d3ccf4983856482563b1c13 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 06:01:16 -0700 Subject: [PATCH 01/83] enable fsdp cases based on local branch --- test/xpu/run_distributed_local.py | 63 +++++++++++++++++++++++++++++++ test/xpu/skip_list_dist_local.py | 57 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 test/xpu/run_distributed_local.py create mode 100644 test/xpu/skip_list_dist_local.py diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..8074b3292 --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,63 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict +from xpu_test_utils import launch_test + +res = 0 +fail_test = [] + +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log", "r") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(' ') + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split('/')[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ( "XL" in item or "S" in item ): + if len(affinity) == 0: + affinity = str(j-2) + else: + affinity = affinity + ',' + str(j-2) + gpu_dict[i] = affinity + + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..08f90c6b5 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,57 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + "test_delayed_optim_step_offload_true_no_shard_xpu", + "test_transformer_no_grad_mixed_precision_True_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + 
"../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + "test_fsdp_zero2_eval_with_prefetch", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1537 + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + "test_use_orig_params", + ), + # Performance check, skip + #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # "test_forward_overlap", + # "test_forward_overlap_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, +} From e772d23680c67301d6e9e5a47b741fc622c49158 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 19:46:24 -0700 Subject: [PATCH 02/83] add 2025.0 WA --- test/xpu/run_distributed_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 8074b3292..b6a9ef60c 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,9 @@ res = 0 fail_test = [] +os.environ["CCL_ATL_TRANSPORT"] = "ofi" +os.environ["CCL_SEND"] = "direct" +os.environ["CCL_RECV"] = "direct" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: From cbd34cd308e4cd601561c3ce64e44c408b94f730 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:27:44 -0700 Subject: [PATCH 03/83] Update distributed UT cases in DDP and PP Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 29 +++++++++- test/xpu/skip_list_dist_local.py | 91 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index b6a9ef60c..982f05409 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -2,15 +2,17 @@ import subprocess import sys -from skip_list_dist_local import skip_dict +from skip_list_dist_local import skip_dict, skip_dict_python from xpu_test_utils import launch_test res = 0 +res2 = 0 fail_test = [] 
os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" os.environ["CCL_RECV"] = "direct" +os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: @@ -48,6 +50,29 @@ print("xpu-smi topology failed") sys.exit(255) +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + +for key in skip_dict_python: + skip_list = skip_dict_python[key] + test_command = ["python", key] + fail = run(test_command) + if fail.returncode: + for line in fail.stderr.split("\n"): + if "FAIL: " in line: + is_error = True + for skip_case in skip_list: + if skip_case in line: + print("Skiped error: ", key + " " + skip_case) + is_error = False + if is_error: + res2 += fail.returncode + fail_test.append("".join(key + " " + line)) + # run pytest with skiplist for key in skip_dict: skip_list = skip_dict[key] @@ -61,6 +86,6 @@ exit_code = os.WEXITSTATUS(res) if exit_code == 0: - sys.exit(res) + sys.exit(res2) else: sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 08f90c6b5..d65b7aee6 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -54,4 +54,95 @@ "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_reduce_scatter_tensor_coalesced", + "test_reduce_scatter_tensor_single", + # https://github.com/intel/torch-xpu-ops/issues/1525 + # ValueError: trying to initialize the default process group twice! 
+ "test_inductor_all_gather_into_tensor_coalesced", + "test_inductor_all_gather_into_tensor_single", + "test_inductor_all_reduce_coalesced", + "test_inductor_all_reduce_non_contig_input", + "test_inductor_all_reduce_single", + "test_inductor_all_to_all_single", + "test_inductor_broadcast", + "test_inductor_inplace_op_on_view", + "test_inductor_reduce_scatter_tensor_coalesced", + "test_inductor_reduce_scatter_tensor_single", + "test_inductor_reuse_buffer_after_inplace_collective", + "test_ranks_and_tag", + "test_wait_tensor", + ), + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_gather_object_cpu", + "test_gather_object_xpu", + "test_gather_object_list_cpu", + "test_gather_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + "test_asymmetric_compilation", + "test_asymmetric_compilation_with_fx_cache", + # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. + "test_compiled_flex_attention_full_model_ddp", + "test_compiled_flex_attention_local_ddp", + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # https://github.com/intel/torch-xpu-ops/issues/1527 + "test_compiler_collectives_automatic_dynamic_scalar", + "test_compiler_collectives_automatic_dynamic_speculation_divergence", + "test_compiler_collectives_automatic_dynamic_tensor", + "test_compiler_collectives_dim_mismatch", + "test_compiler_collectives_graph_break_empty_graph_still_collective", + "test_compiler_collectives_missing_source", + "test_compiler_collectives_scalar_missing_source", + "test_compiler_collectives_type_mismatch", + "test_ddp_activation_checkpointing", + "test_ddp_baseline_aot_eager_multiprocess", + "test_fsdp_activation_checkpointing", + "test_fsdp_aot_eager", + "test_fsdp_inductor", + "test_fsdp_setattr", + "test_fsdp_unspecialized_forced_getattr_inline", + "test_fsdp_unspecialized_forced_getattr_no_inline", + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_get_pg_attr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_tracing_xpu", + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + ), + "../../../../test/distributed/test_multi_threaded_pg.py": ( + # oneccl not support multi-threaded well, so skip it first. 
+ "test_bwd_sees_fwd_pg", + ), + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, +} + +skip_dict_python = { + "distributed/test_c10d_ops_xccl.py": None, + "distributed/test_c10d_xccl.py": None, + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_stage.py": None, } From d856e950310ed44446d81d9b37250b7b7d4fbcc3 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:36:16 -0700 Subject: [PATCH 04/83] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 7 +++---- test/xpu/skip_list_dist_local.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 982f05409..a5f0c8098 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -11,7 +11,7 @@ os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" +os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") @@ -36,13 +36,12 @@ else: affinity = affinity + ',' + str(j-2) gpu_dict[i] = affinity - - + max_affinity = "" for key, value in gpu_dict.items(): if len(value) > len(max_affinity): max_affinity = value - + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index d65b7aee6..6ce62b8ca 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -96,7 +96,7 @@ # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
"test_compiled_flex_attention_full_model_ddp", "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ # https://github.com/intel/torch-xpu-ops/issues/1527 "test_compiler_collectives_automatic_dynamic_scalar", "test_compiler_collectives_automatic_dynamic_speculation_divergence", @@ -131,13 +131,13 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, } skip_dict_python = { From 28a259e59448bb70958a818d3f50fee62f2ebfa2 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 02:01:55 -0700 Subject: [PATCH 05/83] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 38 ++++++++++++++++--------------- test/xpu/skip_list_dist_local.py | 17 +++++++------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index a5f0c8098..d4db4785a 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -17,29 +17,29 @@ ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: gpu_dict = {} - with open("topology.log", "r") as file: + with open("topology.log") as file: lines = file.readlines() for line in lines: - if "CPU Affinity" in line: - continue - line = line.strip() - if line.startswith("GPU "): - items = line.split(' ') - items = [x for x in items if x] - gpu_id = items[1] - i = gpu_id.split('/')[0] - affinity = "" - for j, item in enumerate(items): - if "SYS" not in item and ( "XL" in item or "S" in item ): - if len(affinity) == 0: - affinity = str(j-2) - else: - affinity = affinity + ',' + str(j-2) - gpu_dict[i] = affinity + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity max_affinity = "" for key, value in gpu_dict.items(): - if len(value) > len(max_affinity): + if len(value) > len(max_affinity): max_affinity = value os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) @@ -49,6 +49,7 @@ print("xpu-smi topology failed") sys.exit(255) + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -56,6 +57,7 @@ def run(test_command): print(result.stderr) return result + for key in skip_dict_python: skip_list = skip_dict_python[key] test_command = ["python", key] diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 6ce62b8ca..0ac46961e 100644 --- a/test/xpu/skip_list_dist_local.py +++ 
b/test/xpu/skip_list_dist_local.py @@ -1,10 +1,10 @@ skip_dict = { "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, # https://github.com/intel/torch-xpu-ops/issues/1536 - #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( # "test_distributed_checkpoint_state_dict_type0_xpu", # "test_distributed_checkpoint_state_dict_type1_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, @@ -37,11 +37,11 @@ "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( "test_use_orig_params", ), - # Performance check, skip - #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( # "test_forward_overlap", # "test_forward_overlap_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, @@ -58,7 +58,7 @@ "../../../../test/distributed/test_c10d_common.py": None, "../../../../test/distributed/test_c10d_functional_native.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 - #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path "test_reduce_scatter_tensor_coalesced", "test_reduce_scatter_tensor_single", # https://github.com/intel/torch-xpu-ops/issues/1525 @@ -123,7 +123,7 @@ # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) # https://github.com/intel/torch-xpu-ops/issues/1526 "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. @@ -131,7 +131,6 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, @@ -143,6 +142,6 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, } From 62e9ff75ced8a311c1e52c61fd49c97622075378 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:25:50 +0000 Subject: [PATCH 06/83] add distributed ut in CI --- .github/scripts/ut_result_check.sh | 10 +-- .github/workflows/_linux_build.yml | 6 +- .github/workflows/_linux_ut.yml | 140 +++++++++++++++++++++++++++++ .github/workflows/pull.yml | 25 ++++++ 4 files changed, 175 insertions(+), 6 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 3fb1a1997..32dbed489 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -72,14 +72,14 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED|have failures" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") +if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" echo -e "=========================================================================" - cat "./${ut_suite}_xpu_distributed_test_failed.log" - ((num_failed=num_failed_xpu_distributed)) + cat "./${ut_suite}_test_failed.log" + ((num_failed=num_failed_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" exit 1 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index ee9381c9c..3ed1c3d4e 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -70,7 +70,11 @@ jobs: source activate xpu_build cd ../ && rm -rf pytorch pip install requests - git clone https://github.com/pytorch/pytorch pytorch + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) # apply PRs for stock pytorch python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index a11528a3e..aa631c6dd 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -435,3 +435,143 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-xpu_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + 
source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ec2a73a20..9cf7ef458 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -66,6 +66,31 @@ jobs: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} ut: op_regression,op_regression_dev1,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e Inductor-XPU-E2E-CI-Tests: name: preci-linux / e2e_test From 119d2fb5b20a32990eeb0377ce490f2fe3f89894 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:52:17 +0000 Subject: [PATCH 07/83] update if condition --- .github/workflows/_linux_build.yml | 26 ++++++++++++++------------ .github/workflows/_linux_ut.yml | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 3ed1c3d4e..eda5de367 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -72,20 +72,22 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - fi - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi - name: Build Pytorch XPU run: | diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index aa631c6dd..907c5cd2a 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -49,7 +49,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} From 5ff20baae6dba5dee9d6c2ea83773a436229e299 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 23:02:20 +0000 Subject: [PATCH 08/83] keep_torch_xpu_ops --- .github/workflows/pull.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 9cf7ef458..f0b1b8e22 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,6 +78,7 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 + keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From cc472d7823415596734eb9c7e7afb0a3b8c7203b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sun, 6 Apr 2025 19:24:08 +0000 Subject: [PATCH 09/83] update keyword in distributed ut check --- .github/scripts/ut_result_check.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 32dbed489..9bf611786 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -73,10 +73,10 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then fi fi if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then - grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + grep -E "^FAILED|have failures" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} xpu distributed" + echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" cat "./${ut_suite}_test_failed.log" ((num_failed=num_failed_distributed)) From 60dbd6eb19a407058eb5f1e6c4972df7fed94fe1 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 13:37:10 +0000 Subject: [PATCH 10/83] update pytorch build --- .github/workflows/_linux_build.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index eda5de367..3ed1c3d4e 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -72,22 +72,20 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python 
../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi - name: Build Pytorch XPU run: | From af0bca95baf745631876e918dfd4ab6b6823778c Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 06:01:16 -0700 Subject: [PATCH 11/83] enable fsdp cases based on local branch --- test/xpu/run_distributed_local.py | 63 +++++++++++++++++++++++++++++++ test/xpu/skip_list_dist_local.py | 57 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 test/xpu/run_distributed_local.py create mode 100644 test/xpu/skip_list_dist_local.py diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..8074b3292 --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,63 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict +from xpu_test_utils import launch_test + +res = 0 +fail_test = [] + +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log", "r") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(' ') + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split('/')[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ( "XL" in item or "S" in item ): + if len(affinity) == 0: + affinity = str(j-2) + else: + affinity = affinity + ',' + str(j-2) + gpu_dict[i] = affinity + + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..08f90c6b5 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,57 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + 
#"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + "test_delayed_optim_step_offload_true_no_shard_xpu", + "test_transformer_no_grad_mixed_precision_True_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + "test_fsdp_zero2_eval_with_prefetch", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1537 + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + "test_use_orig_params", + ), + # Performance check, skip + #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # "test_forward_overlap", + # "test_forward_overlap_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, +} From 6885a00cdf79029a72ff85938bdf330937ada7e4 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 19:46:24 -0700 Subject: [PATCH 12/83] add 2025.0 WA --- test/xpu/run_distributed_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 8074b3292..b6a9ef60c 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,9 @@ res = 0 fail_test 
= [] +os.environ["CCL_ATL_TRANSPORT"] = "ofi" +os.environ["CCL_SEND"] = "direct" +os.environ["CCL_RECV"] = "direct" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: From cd013d7882b28620cf0b81aace3f212bcbedaca9 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:27:44 -0700 Subject: [PATCH 13/83] Update distributed UT cases in DDP and PP Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 29 +++++++++- test/xpu/skip_list_dist_local.py | 91 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index b6a9ef60c..982f05409 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -2,15 +2,17 @@ import subprocess import sys -from skip_list_dist_local import skip_dict +from skip_list_dist_local import skip_dict, skip_dict_python from xpu_test_utils import launch_test res = 0 +res2 = 0 fail_test = [] os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" os.environ["CCL_RECV"] = "direct" +os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: @@ -48,6 +50,29 @@ print("xpu-smi topology failed") sys.exit(255) +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + +for key in skip_dict_python: + skip_list = skip_dict_python[key] + test_command = ["python", key] + fail = run(test_command) + if fail.returncode: + for line in fail.stderr.split("\n"): + if "FAIL: " in line: + is_error = True + for skip_case in skip_list: + if skip_case in line: + print("Skiped error: ", key + " " + skip_case) + is_error = False + if is_error: + res2 += fail.returncode + fail_test.append("".join(key + " " + line)) + # run pytest with skiplist for key in skip_dict: skip_list = skip_dict[key] @@ -61,6 +86,6 @@ exit_code = os.WEXITSTATUS(res) if exit_code == 0: - sys.exit(res) + sys.exit(res2) else: sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 08f90c6b5..d65b7aee6 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -54,4 +54,95 @@ "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_reduce_scatter_tensor_coalesced", + "test_reduce_scatter_tensor_single", + # https://github.com/intel/torch-xpu-ops/issues/1525 + # ValueError: trying to initialize the default process group twice! 
+ "test_inductor_all_gather_into_tensor_coalesced", + "test_inductor_all_gather_into_tensor_single", + "test_inductor_all_reduce_coalesced", + "test_inductor_all_reduce_non_contig_input", + "test_inductor_all_reduce_single", + "test_inductor_all_to_all_single", + "test_inductor_broadcast", + "test_inductor_inplace_op_on_view", + "test_inductor_reduce_scatter_tensor_coalesced", + "test_inductor_reduce_scatter_tensor_single", + "test_inductor_reuse_buffer_after_inplace_collective", + "test_ranks_and_tag", + "test_wait_tensor", + ), + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_gather_object_cpu", + "test_gather_object_xpu", + "test_gather_object_list_cpu", + "test_gather_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + "test_asymmetric_compilation", + "test_asymmetric_compilation_with_fx_cache", + # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. + "test_compiled_flex_attention_full_model_ddp", + "test_compiled_flex_attention_local_ddp", + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # https://github.com/intel/torch-xpu-ops/issues/1527 + "test_compiler_collectives_automatic_dynamic_scalar", + "test_compiler_collectives_automatic_dynamic_speculation_divergence", + "test_compiler_collectives_automatic_dynamic_tensor", + "test_compiler_collectives_dim_mismatch", + "test_compiler_collectives_graph_break_empty_graph_still_collective", + "test_compiler_collectives_missing_source", + "test_compiler_collectives_scalar_missing_source", + "test_compiler_collectives_type_mismatch", + "test_ddp_activation_checkpointing", + "test_ddp_baseline_aot_eager_multiprocess", + "test_fsdp_activation_checkpointing", + "test_fsdp_aot_eager", + "test_fsdp_inductor", + "test_fsdp_setattr", + "test_fsdp_unspecialized_forced_getattr_inline", + "test_fsdp_unspecialized_forced_getattr_no_inline", + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_get_pg_attr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_tracing_xpu", + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + ), + "../../../../test/distributed/test_multi_threaded_pg.py": ( + # oneccl not support multi-threaded well, so skip it first. 
+ "test_bwd_sees_fwd_pg", + ), + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, +} + +skip_dict_python = { + "distributed/test_c10d_ops_xccl.py": None, + "distributed/test_c10d_xccl.py": None, + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_stage.py": None, } From cd92f232de04270a17571df0989be7f32f679fcf Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:36:16 -0700 Subject: [PATCH 14/83] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 7 +++---- test/xpu/skip_list_dist_local.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 982f05409..a5f0c8098 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -11,7 +11,7 @@ os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" +os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") @@ -36,13 +36,12 @@ else: affinity = affinity + ',' + str(j-2) gpu_dict[i] = affinity - - + max_affinity = "" for key, value in gpu_dict.items(): if len(value) > len(max_affinity): max_affinity = value - + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index d65b7aee6..6ce62b8ca 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -96,7 +96,7 @@ # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
"test_compiled_flex_attention_full_model_ddp", "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ # https://github.com/intel/torch-xpu-ops/issues/1527 "test_compiler_collectives_automatic_dynamic_scalar", "test_compiler_collectives_automatic_dynamic_speculation_divergence", @@ -131,13 +131,13 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, } skip_dict_python = { From 413c2b09b48eba42bfc67ed70fb03973edef50a5 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 02:01:55 -0700 Subject: [PATCH 15/83] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 38 ++++++++++++++++--------------- test/xpu/skip_list_dist_local.py | 17 +++++++------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index a5f0c8098..d4db4785a 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -17,29 +17,29 @@ ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: gpu_dict = {} - with open("topology.log", "r") as file: + with open("topology.log") as file: lines = file.readlines() for line in lines: - if "CPU Affinity" in line: - continue - line = line.strip() - if line.startswith("GPU "): - items = line.split(' ') - items = [x for x in items if x] - gpu_id = items[1] - i = gpu_id.split('/')[0] - affinity = "" - for j, item in enumerate(items): - if "SYS" not in item and ( "XL" in item or "S" in item ): - if len(affinity) == 0: - affinity = str(j-2) - else: - affinity = affinity + ',' + str(j-2) - gpu_dict[i] = affinity + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity max_affinity = "" for key, value in gpu_dict.items(): - if len(value) > len(max_affinity): + if len(value) > len(max_affinity): max_affinity = value os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) @@ -49,6 +49,7 @@ print("xpu-smi topology failed") sys.exit(255) + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -56,6 +57,7 @@ def run(test_command): print(result.stderr) return result + for key in skip_dict_python: skip_list = skip_dict_python[key] test_command = ["python", key] diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 6ce62b8ca..0ac46961e 100644 --- a/test/xpu/skip_list_dist_local.py +++ 
b/test/xpu/skip_list_dist_local.py @@ -1,10 +1,10 @@ skip_dict = { "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, # https://github.com/intel/torch-xpu-ops/issues/1536 - #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( # "test_distributed_checkpoint_state_dict_type0_xpu", # "test_distributed_checkpoint_state_dict_type1_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, @@ -37,11 +37,11 @@ "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( "test_use_orig_params", ), - # Performance check, skip - #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( # "test_forward_overlap", # "test_forward_overlap_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, @@ -58,7 +58,7 @@ "../../../../test/distributed/test_c10d_common.py": None, "../../../../test/distributed/test_c10d_functional_native.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 - #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path "test_reduce_scatter_tensor_coalesced", "test_reduce_scatter_tensor_single", # https://github.com/intel/torch-xpu-ops/issues/1525 @@ -123,7 +123,7 @@ # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) # https://github.com/intel/torch-xpu-ops/issues/1526 "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. @@ -131,7 +131,6 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, @@ -143,6 +142,6 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, } From ab68eeef12b5546c9d5ff7000b222442ce88ca3f Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:25:50 +0000 Subject: [PATCH 16/83] add distributed ut in CI --- .github/scripts/ut_result_check.sh | 10 +-- .github/workflows/_linux_build.yml | 6 +- .github/workflows/_linux_ut.yml | 140 +++++++++++++++++++++++++++++ .github/workflows/pull.yml | 25 ++++++ 4 files changed, 175 insertions(+), 6 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 3fb1a1997..32dbed489 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -72,14 +72,14 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED|have failures" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") +if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" echo -e "=========================================================================" - cat "./${ut_suite}_xpu_distributed_test_failed.log" - ((num_failed=num_failed_xpu_distributed)) + cat "./${ut_suite}_test_failed.log" + ((num_failed=num_failed_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" exit 1 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index b67be9f29..f17d02a0c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -65,7 +65,11 @@ jobs: source activate xpu_build cd ../ && rm -rf pytorch pip install requests - git clone https://github.com/pytorch/pytorch pytorch + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) # apply PRs for stock pytorch python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index e2e21bbfb..1edd00a7c 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -420,3 +420,143 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate 
xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3dd204e32..be9d35397 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -66,6 +66,31 @@ jobs: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} ut: op_regression,op_regression_dev1,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e Inductor-XPU-E2E-CI-Tests: name: preci-linux / e2e_test From c5ec1405e405404d2f3f991d8ffbc213f6f2da5a Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:52:17 +0000 Subject: [PATCH 17/83] update if condition --- .github/workflows/_linux_build.yml | 26 ++++++++++++++------------ .github/workflows/_linux_ut.yml | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index f17d02a0c..e31d1e27b 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -67,20 +67,22 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - fi - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi - name: Build Pytorch XPU run: | diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 1edd00a7c..94dacaf54 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} From edc9e1b5bcde0adf04d47a634ab413cbae41c05a Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 23:02:20 +0000 Subject: [PATCH 18/83] keep_torch_xpu_ops --- .github/workflows/pull.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index be9d35397..eec6b2893 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,6 +78,7 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 + keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From 6c9e99adf2288f6652d0ccc8b84749e353800b85 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sun, 6 Apr 2025 19:24:08 +0000 Subject: [PATCH 19/83] update keyword in distributed ut check --- .github/scripts/ut_result_check.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 32dbed489..9bf611786 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -73,10 +73,10 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then fi fi if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then - grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + grep -E "^FAILED|have failures" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} xpu distributed" + echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" cat "./${ut_suite}_test_failed.log" ((num_failed=num_failed_distributed)) From bdfa8536c16191cede8c9fd5710e1b90a8e526cc Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 13:37:10 +0000 Subject: [PATCH 20/83] update pytorch build --- .github/workflows/_linux_build.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index e31d1e27b..f17d02a0c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -67,22 +67,20 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python 
../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi - name: Build Pytorch XPU run: | From 0e77f3030f4e03c4b2cbadf19e1d3cf7c523d744 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 14:55:26 +0000 Subject: [PATCH 21/83] update if condition --- .github/workflows/_linux_ut.yml | 2 +- .github/workflows/pull.yml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 94dacaf54..deddcc5db 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' && inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index eec6b2893..be9d35397 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,7 +78,6 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 - keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From 4076a1a940d148137f9f530c5efface6ba2365d4 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 18:12:34 +0000 Subject: [PATCH 22/83] resolve Artifact name conflict --- .github/workflows/_linux_build.yml | 4 ++-- .github/workflows/_linux_ut.yml | 15 +++++---------- .github/workflows/pull.yml | 4 ++-- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index f17d02a0c..ae6c2064c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -171,11 +171,11 @@ jobs: if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/torch*.whl - name: Upload Build Log if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/pytorch_*.log diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index deddcc5db..0e8265639 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -95,7 +95,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -339,7 +339,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -474,18 +474,13 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | source activate xpu_op_${ZE_AFFINITY_MASK} source .github/scripts/env.sh ${{ inputs.pytorch }} pip install mkl-static==2025.0.1 mkl-include==2025.0.1 - if [[ ${{ inputs.abi }} == '0' ]]; then - export _GLIBCXX_USE_CXX11_ABI=0 - else - export _GLIBCXX_USE_CXX11_ABI=1 - fi if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then cd ../pytorch export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} @@ -534,7 +529,7 @@ jobs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi - timeout 10000 python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: UT Test Results Check @@ -558,5 +553,5 @@ jobs: if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index be9d35397..0e9ee9f63 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -88,7 +88,7 @@ jobs: needs: preci-linux-build-distributed uses: ./.github/workflows/_linux_ut.yml with: - pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + pytorch: ${{ needs.preci-linux-build-distributed.outputs.torch_commit_id }} ut: pytorch_distributed runner: pvc_e2e @@ -137,7 +137,7 @@ jobs: if: ${{ inputs.pytorch }} != 'nightly_wheel' uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ needs.preci-linux-build.outputs.torch_commit_id }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | From 5596ac4436e9d6b1b0367915b3d52ea25c408b5b Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Mon, 7 Apr 2025 23:41:37 -0700 Subject: [PATCH 23/83] enabled test_sharder.py on xpu --- test/xpu/skip_list_dist_local.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0ac46961e..218746b71 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -137,6 +137,7 @@ "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/_shard/test_sharder.py": None, } skip_dict_python = { From 2ed797354aab68575dc8c4ee0f746c9eef9eadac Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Wed, 9 Apr 2025 00:18:27 -0700 Subject: [PATCH 24/83] Enabled UT for test/distributed/tensor Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0ac46961e..42cdebf19 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -136,7 +136,85 @@ "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( + # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' + # is not currently implemented for the XPU device + # https://github.com/intel/torch-xpu-ops/issues/1547 + "test_dtensor_seq_par_shard_dim_0", + "test_dtensor_seq_par_shard_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' 
+ # https://github.com/intel/torch-xpu-ops/issues/1548 + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", + # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' + # https://github.com/intel/torch-xpu-ops/issues/1549 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", + # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1550 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", + # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' + # is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1551 + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_2", + ), + "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( + # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! + # https://github.com/intel/torch-xpu-ops/issues/1555 + "test/distributed/tensor/parallel/test_tp_examples.py::DistTensorParallelExampleTest::test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_output__tok_embeddings", + "test_transformer_training_is_seq_parallel_False_float32", + "test_transformer_training_is_seq_parallel_True_float32", + # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. 
+ # https://github.com/intel/torch-xpu-ops/issues/1556 + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + ), "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, + "../../../../test/distributed/tensor/test_api.py": None, + "../../../../test/distributed/tensor/test_attention.py": None, + "../../../../test/distributed/tensor/test_common_rules.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": None, + "../../../../test/distributed/tensor/test_dtensor_compile.py": None, + "../../../../test/distributed/tensor/test_experimental_ops.py": None, + "../../../../test/distributed/tensor/test_init.py": None, + "../../../../test/distributed/tensor/test_math_ops.py": ( + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_mean", + "test_nll_loss_and_cross_entropy", + ), + "../../../../test/distributed/tensor/test_random_ops.py": None, + "../../../../test/distributed/tensor/test_redistribute.py": None, + "../../../../test/distributed/tensor/test_tensor_ops.py": None, + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, } skip_dict_python = { @@ -144,4 +222,5 @@ "distributed/test_c10d_xccl.py": None, "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. "../../../../test/distributed/pipelining/test_stage.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, } From 5bab858cbde56b7319c43690157aee43d06917f3 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 9 Apr 2025 23:57:58 -0700 Subject: [PATCH 25/83] add FSDP2 cases, improved check-ut.py for summary, do ZE_AFFINITY_MASK configuration before import torch --- .github/scripts/check-ut.py | 5 ++++- test/xpu/run_distributed_local.py | 3 ++- test/xpu/skip_list_dist_local.py | 17 +++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 8cd490bc8..9d9e4edfd 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -30,7 +30,8 @@ def get_result(case): def get_message(case): if not case.result: return "" - return f"{case.result[0].message.splitlines()[0]}" + #return f" for line in {case.result[0].message.splitlines()}" + return [item for item in case.result[0].message.splitlines() if "Error:" in item] def print_md_row(row, print_header): if print_header: @@ -75,6 +76,8 @@ def print_suite(suite): category = 'op_extended' elif 'op_ut' in ut: category = 'op_ut' + else: + category = "default" row = { 'Category': category, 'UT': ut, diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index d4db4785a..1c2435e15 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -3,7 +3,6 @@ import sys from skip_list_dist_local import skip_dict, skip_dict_python -from xpu_test_utils import launch_test res = 0 res2 = 0 @@ -50,6 +49,8 @@ sys.exit(255) +from xpu_test_utils import launch_test + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 5629046d9..a41c91f18 100644 --- a/test/xpu/skip_list_dist_local.py 
+++ b/test/xpu/skip_list_dist_local.py @@ -216,6 +216,23 @@ "../../../../test/distributed/tensor/test_tensor_ops.py": None, "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, "../../../../test/distributed/_shard/test_sharder.py": None, + # FSDP2 + "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": None, } skip_dict_python = { From f1b824d7764ddf88989f1960519a84dc449fbb56 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 10 Apr 2025 01:27:23 -0700 Subject: [PATCH 26/83] Skip test_schedule_multiproc.py for hang error Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 5629046d9..b2984fb17 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -221,7 +221,7 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + # "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, } From f696faad63d48e4a2e65a15340c998aedc9d529d Mon Sep 17 00:00:00 2001 From: Cheng Penghui Date: Mon, 14 Apr 2025 23:14:30 -0700 Subject: [PATCH 27/83] refine error log for test files without pytest Signed-off-by: Cheng Penghui --- test/xpu/run_distributed_local.py | 52 ++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index d4db4785a..96761cd82 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,7 @@ res = 0 res2 = 0 fail_test = [] +error_log = "" os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" @@ -59,20 +60,49 @@ def run(test_command): for key in skip_dict_python: - skip_list = skip_dict_python[key] + skip_list = skip_dict_python[key] if skip_dict_python[key] else [] test_command = ["python", key] fail = run(test_command) if fail.returncode: - for line in fail.stderr.split("\n"): - if "FAIL: " in line: - is_error = True - for skip_case in skip_list: - if skip_case in line: - print("Skiped error: ", key + " " + skip_case) - is_error = False - if is_error: - res2 += fail.returncode - fail_test.append("".join(key + " " + line)) + num_skipped = 0 + num_err = 0 + for i, err in enumerate(fail.stderr.split("FAIL: ")): + if i == 0 and len(err) > 0: + error_log += err + continue + is_skipped = False + for skip_case in skip_list: + if skip_case in err: + print("Skipped error: ", key + " " + skip_case) + num_skipped += 1 + is_skipped = True + break + if not is_skipped: + num_err += 1 + res2 += fail.returncode + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + else: + error_log += (line + "\n") + else: + error_log += ("FAIL: " + err) + else: + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + + if num_err > 0: + fail_test.append(key) + renamed_key = key.replace("../../../../", "").replace("/", "_") + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(error_log) # run pytest with skiplist for key in skip_dict: From 00326ac761623a735718105609c6e0cb05686a7c Mon Sep 17 00:00:00 2001 From: Cheng Penghui Date: Tue, 15 Apr 2025 01:50:09 -0700 Subject: [PATCH 28/83] Fixed error for create log file without pytest Signed-off-by: Cheng Penghui --- test/xpu/run_distributed_local.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 46a0be814..46905cef1 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -64,9 +64,9 @@ def run(test_command): skip_list = skip_dict_python[key] if skip_dict_python[key] else [] test_command = ["python", key] fail = run(test_command) + num_skipped = 0 + num_err = 0 if fail.returncode: - num_skipped = 0 - num_err = 0 for i, err in enumerate(fail.stderr.split("FAIL: ")): if i == 0 and len(err) > 
0:
error_log += err
@@ -99,11 +99,16 @@ def run(test_command):
num_errs = line.split("=")[1].split(")")[0].strip()
error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n")
+ renamed_key = key.replace("../../../../", "").replace("/", "_")
if num_err > 0:
fail_test.append(key)
- renamed_key = key.replace("../../../../", "").replace("/", "_")
with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f:
f.write(error_log)
+ else:
+ import pdb;pdb.set_trace()
+ with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f:
+ f.write(fail.stdout)
+ f.write(fail.stderr)
# run pytest with skiplist
for key in skip_dict:
From 59c609e66945c3b4d2dae80a3f909256451be4e3 Mon Sep 17 00:00:00 2001
From: "Cheng, Penghui"
Date: Tue, 15 Apr 2025 23:07:01 -0700
Subject: [PATCH 29/83] Skipped cases that raised issues
Signed-off-by: Cheng, Penghui
---
test/xpu/run_distributed_local.py | 3 -
test/xpu/skip_list_dist_local.py | 271 +++++++++++++++++++++++++++---
2 files changed, 246 insertions(+), 28 deletions(-)
diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py
index 46a0be814..63a588416 100644
--- a/test/xpu/run_distributed_local.py
+++ b/test/xpu/run_distributed_local.py
@@ -9,9 +9,6 @@
fail_test = []
error_log = ""
-os.environ["CCL_ATL_TRANSPORT"] = "ofi"
-os.environ["CCL_SEND"] = "direct"
-os.environ["CCL_RECV"] = "direct"
os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining"
# Get the xelink group card affinity
ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py
index e6a2a34f3..9ec4c59e0 100644
--- a/test/xpu/skip_list_dist_local.py
+++ b/test/xpu/skip_list_dist_local.py
@@ -7,19 +7,120 @@
# ),
"../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
"../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None,
- "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None,
- "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None,
+ "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": (
+ # https://github.com/intel/torch-xpu-ops/issues/1504
+ "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False",
+ "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False",
+ "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False",
+ "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False",
+ "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False",
+ "test_checkpoint_submodule_use_reentrant_False_xpu",
+ ),
+ "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": (
+ # https://github.com/intel/torch-xpu-ops/issues/1504
+ "test_ddp_parity_xpu",
+ ),
"../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
"../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": (
+ # https://github.com/intel/torch-xpu-ops/issues/1504
+ "test_bf16_hook_has_wrapping_False_sharding_strategy0",
+ "test_bf16_hook_has_wrapping_False_sharding_strategy1",
+ "test_bf16_hook_has_wrapping_False_sharding_strategy2",
+ "test_bf16_hook_has_wrapping_True_sharding_strategy0",
+ "test_bf16_hook_has_wrapping_True_sharding_strategy1",
+ "test_bf16_hook_has_wrapping_True_sharding_strategy2",
+
"test_fp16_hook_has_wrapping_False_sharding_strategy1", + "test_fp16_hook_has_wrapping_False_sharding_strategy2", + "test_fp16_hook_has_wrapping_True_sharding_strategy0", + "test_fp16_hook_has_wrapping_True_sharding_strategy1", + "test_fp16_hook_has_wrapping_True_sharding_strategy2", + ), "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 "test_delayed_optim_step_offload_true_no_shard_xpu", "test_transformer_no_grad_mixed_precision_True_xpu", + "test_delayed_optim_step_offload_false_no_shard_xpu", + "test_delayed_optim_step_offload_false_none_xpu", + "test_delayed_optim_step_offload_false_shard_grad_op_xpu", + "test_delayed_optim_step_offload_true_none_xpu", + "test_delayed_optim_step_offload_true_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_false_no_shard_xpu", + "test_delayed_reduce_scatter_offload_false_none_xpu", + "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_true_none_xpu", + "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_offload_false_no_shard_xpu", + "test_mixture_of_experts_offload_false_none_xpu", + "test_mixture_of_experts_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_offload_true_none_xpu", + "test_mixture_of_experts_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_false_no_shard_xpu", + "test_nested_always_wrap_model_offload_false_none_xpu", + "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_true_none_xpu", + "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_false_no_shard_xpu", + "test_nested_wrapped_model_offload_false_none_xpu", + "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_true_none_xpu", + "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", + "test_transformer_offload_false_none_xpu", + "test_transformer_offload_false_shard_grad_op_xpu", + "test_transformer_offload_true_none_xpu", + "test_transformer_offload_true_shard_grad_op_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + " test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", ), - "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, - "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_hooks_multi_traversal_xpu", + "test_parity_with_ddp_xpu", + "test_parity_with_non_frozen_fsdp_xpu", + ), "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, - "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": ( + # 
https://github.com/intel/torch-xpu-ops/issues/1504 + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + 
"test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True ", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + ), "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, @@ -28,28 +129,89 @@ "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 "test_fsdp_zero2_eval_with_prefetch", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_optimizer_overlap", ), "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, - "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_multi_forward_cpu", + ), "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, # https://github.com/intel/torch-xpu-ops/issues/1537 "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_flatten_sharded_optim_state_dict_nested", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + 
"test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_names", + "test_scatter_full_optim_state_dict_nested_halve_world_size", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_halve_world_size", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + 
"test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", "test_use_orig_params", ), # Performance check, skip # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # # https://github.com/intel/torch-xpu-ops/issues/1504 # "test_forward_overlap", # "test_forward_overlap_xpu", # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, - "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, - "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_shard_grad_op_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_shard_grad_op_none_none", + ), + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_state_dict_save_load_flow_state_dict_type_local_state_dict", + "test_state_dict_save_load_flow_state_dict_type_sharded_state_dict", + "test_state_dict_save_load_flow_state_dict_type_state_dict", + ), "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, - "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_diff_hyperparams_sharding_strategy_str_full_shard", + "test_diff_hyperparams_sharding_strategy_str_no_shard", + "test_diff_hyperparams_sharding_strategy_str_shard_grad_op", + "test_no_sync_correctness", + ), "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, @@ -127,11 +289,20 @@ ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. 
+ # https://github.com/intel/torch-xpu-ops/issues/1509 "test_bwd_sees_fwd_pg", ), "../../../../test/distributed/test_store.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_backward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_stage_backward_weight_multiple_iters_xpu", + "test_stage_backward_weight_xpu", + "test_stage_backward_xpu", + ), + "../../../../test/distributed/pipelining/test_microbatch.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_chunk_spec_xpu", + ), "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, @@ -184,7 +355,7 @@ "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! # https://github.com/intel/torch-xpu-ops/issues/1555 - "test/distributed/tensor/parallel/test_tp_examples.py::DistTensorParallelExampleTest::test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_all", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", @@ -201,9 +372,29 @@ "../../../../test/distributed/tensor/test_api.py": None, "../../../../test/distributed/tensor/test_attention.py": None, "../../../../test/distributed/tensor/test_common_rules.py": None, - "../../../../test/distributed/tensor/test_dtensor.py": None, - "../../../../test/distributed/tensor/test_dtensor_compile.py": None, - "../../../../test/distributed/tensor/test_experimental_ops.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": ( + # Passed with updated test code for world_size 8 + "test_auto_implicit_replication", + "test_default_value_sub_mesh", + "test_device_mesh_nd", + "test_dtensor_2d_mesh", + "test_dtensor_api_device_mesh_context_manager", + "test_dtensor_device_mesh_device_conversion", + "test_dtensor_spec_local_shard_offset", + "test_from_local_sub_mesh", + "test_implicit_replication", + "test_metadata_consistency_check", + "test_redistribute_sub_mesh", + "test_split_tensor_1D", + ), + "../../../../test/distributed/tensor/test_dtensor_compile.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_2d_fsdp_tp_compile", + ), + "../../../../test/distributed/tensor/test_experimental_ops.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_bernoulli", + ), "../../../../test/distributed/tensor/test_init.py": None, "../../../../test/distributed/tensor/test_math_ops.py": ( # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path @@ -211,28 +402,58 @@ "test_mean", "test_nll_loss_and_cross_entropy", ), - "../../../../test/distributed/tensor/test_random_ops.py": None, - "../../../../test/distributed/tensor/test_redistribute.py": None, + "../../../../test/distributed/tensor/test_random_ops.py": ( + # Need to update world 
size + "test_hsdp_tp_model_meta_init", + ), + "../../../../test/distributed/tensor/test_redistribute.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_redistribute_shard_dim_multi_dim_mesh", + ), "../../../../test/distributed/tensor/test_tensor_ops.py": None, "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, "../../../../test/distributed/_shard/test_sharder.py": None, # FSDP2 "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_clip_grad_norm_2d", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1571 + "test_set_reduce_scatter_divide_factor", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_gradient_scaler", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_fully_shard_training_memory", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": ( + # Performance test, should skip + "test_fully_shard_training_overlap", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1572 + "test_dp_state_dict_cpu_offload", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_post_optim_event", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_train_parity_multi_group_unshard_async_op", + "test_train_parity_with_activation_checkpointing", + ), } skip_dict_python = { From de00feb16f2d68bf2f1d752bb3d5458eca4cf223 Mon Sep 17 00:00:00 2001 From: Zhong Ruijie <109201212+RUIJIEZHONG66166@users.noreply.github.com> Date: Wed, 16 Apr 2025 16:49:41 +0800 Subject: [PATCH 30/83] Update ut summary --- .github/scripts/check-ut.py | 302 
+++++++++++++++++++++++--------- .github/workflows/_linux_ut.yml | 12 ++ 2 files changed, 235 insertions(+), 79 deletions(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 9d9e4edfd..7e7c6ecd4 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -1,22 +1,47 @@ import argparse import sys import os +import re from junitparser import JUnitXml, Error, Failure, Skipped -parser = argparse.ArgumentParser() -parser.add_argument('junitxml', nargs='+') +parser = argparse.ArgumentParser(description='Test results analyzer') +parser.add_argument('input_files', nargs='+', help='JUnit XML files or log files') args = parser.parse_args() failures = [] -suites = [] +summaries = [] + +error_types = [ + "RuntimeError", + "ValueError", + "TypeError", + "AttributeError", + "KeyError", + "IndexError", + "ImportError", + "AssertionError", + "Exception", + "OSError", + "Failed", + "TimeoutError", + "asyncio.TimeoutError", + "FileNotFoundError", + "PermissionError", + "NotImplementedError", +] def get_classname(case): - return ' '.join(case.classname.split()) + return ' '.join(case.classname.split()) if hasattr(case, 'classname') else case.get('classname', '') def get_name(case): + if isinstance(case, dict): + return case.get('name', '') return ' '.join(case.name.split()) def get_result(case): + if isinstance(case, dict): + return case.get('status', 'failed') + result = "passed" if case.result: if isinstance(case.result[0], Error): @@ -28,91 +53,210 @@ def get_result(case): return result def get_message(case): + if isinstance(case, dict): + return case.get('error', '') + if not case.result: return "" - #return f" for line in {case.result[0].message.splitlines()}" - return [item for item in case.result[0].message.splitlines() if "Error:" in item] -def print_md_row(row, print_header): + full_text = case.result[0].text if hasattr(case.result[0], 'text') else case.result[0].message + if not full_text: + return "" + + error_messages = [] + capture_next_lines = False + indent_level = 0 + + for line in full_text.splitlines(): + stripped_line = line.strip() + if not stripped_line: + continue + + for error_type in error_types: + if stripped_line.startswith(error_type + ": "): + error_msg = stripped_line[len(error_type)+2:] + error_messages.append(f"{error_type}: {error_msg}") + capture_next_lines = True + indent_level = 0 + break + elif f"{error_type}:" in stripped_line and "Traceback" not in stripped_line: + error_msg = stripped_line.split(f'{error_type}:')[-1].strip() + error_messages.append(f"{error_type}: {error_msg}") + capture_next_lines = True + indent_level = 0 + break + + return " ; ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" + + +def print_md_row(row, print_header=False): if print_header: - header = " | ".join([f"{key}" for key, _ in row.items()]) + header = " | ".join([f"{key}" for key in row.keys()]) print(f"| {header} |") - header = " | ".join(["-"*len(key) for key, _ in row.items()]) + header = " | ".join(["---"] * len(row)) print(f"| {header} |") - row = " | ".join([f"{value}" for _, value in row.items()]) - print(f"| {row} |") + row_values = " | ".join([f"{value}" for value in row.values()]) + print(f"| {row_values} |") + +def print_failures(): + if not failures: + return -def print_cases(cases): + print("### Test Failures") print_header = True - for case in cases: - classname = get_classname(case) - name = get_name(case) - result = get_result(case) - message = get_message(case) - row = { - 'Class name': 
classname, - 'Test name': name, - 'Status': result, - 'Message': message, - } - print_md_row(row, print_header) + for case in failures: + print_md_row({ + 'Class name': get_classname(case), + 'Test name': get_name(case), + 'Status': get_result(case), + 'Message': get_message(case), + 'Source': case['source'] if isinstance(case, dict) else 'XML' + }, print_header) print_header = False -def print_suite(suite): - print_header = True - for suite in suites: - ut = args.junitxml[0] - del(args.junitxml[0]) - ut = os.path.basename(ut).split('.')[0] - tests = suite.tests - skipped = suite.skipped - failures = suite.failures - errors = suite.errors - if ut == 'op_regression': - category = 'op_regression' - elif ut == 'op_regression_dev1': - category = 'op_regression_dev1' - elif ut == 'op_extended': - category = 'op_extended' - elif 'op_ut' in ut: - category = 'op_ut' +def parse_log_file(log_file): + with open(log_file, encoding='utf-8') as f: + content = f.read() + + ut_name = os.path.splitext(os.path.basename(log_file))[0] + summary = { + 'Category': determine_category(ut_name), + 'UT': ut_name, + 'Test cases': 0, + 'Passed': 0, + 'Skipped': 0, + 'Failures': 0, + 'Errors': 0, + 'Source': 'Log' + } + + # Extract test counts + test_run_match = re.search(r"Ran (\d+) tests in [\d.]+s", content) + if test_run_match: + summary['Test cases'] = int(test_run_match.group(1)) + + # Extract skipped case number + skipped_match = re.search(r"skipped[ =](\d+)", content, re.IGNORECASE) + if skipped_match: + summary['Skipped'] = int(skipped_match.group(1)) + else: + skipped_match = re.search(r"skipped (\d+) cases?", content, re.IGNORECASE) + if skipped_match: + summary['Skipped'] = int(skipped_match.group(1)) + + # Extract failures + failure_blocks = re.findall(r"(FAIL:.*?)(?:\n\n|\n=+\n|\Z)", content, re.DOTALL) + exist_test_names = set() + failures_number = 0 + + for block in failure_blocks: + case_match = re.match(r"FAIL: (\w+) \(__mp_main__\.(\w+)\)", block) + if not case_match: + continue + + test_name = case_match.group(1) + if test_name in exist_test_names: + continue + exist_test_names.add(test_name) + + error_msg = [] + error_pattern = r"(" + "|".join(error_types) + r"):.*?(?=\n\S|\n\n|\n=+\n|\Z)" + error_matches = re.finditer(error_pattern, block, re.DOTALL) + if not error_matches and "Traceback" in block: + error_msg.append("Unknown error (see traceback)") else: - category = "default" - row = { - 'Category': category, - 'UT': ut, - 'Test cases': tests, - 'Passed': tests-skipped-failures-errors, - 'Skipped': skipped, - 'Failures': failures, - 'Errors': errors, - } - print_md_row(row, print_header) + for match in error_matches: + error_msg.append(match.group(0).strip()) + + failures.append({ + 'classname': ut_name, + 'name': f"{case_match.group(2)}:{test_name}", + 'error': " ".join(error_msg), + 'status': 'failed', + 'source': 'Log' + }) + failures_number += 1 + + if failures_number > summary['Failures']: + summary['Failures'] = failures_number + summary['Passed'] = summary['Test cases'] - summary['Failures'] - summary['Skipped'] + + return summary + +def determine_category(ut): + if ut == 'op_regression': + return 'op_regression' + elif ut == 'op_regression_dev1': + return 'op_regression_dev1' + elif ut == 'op_extended': + return 'op_extended' + elif 'op_ut' in ut: + return 'op_ut' + else: + return 'unknown' + +def process_log_file(log_file): + try: + summary = parse_log_file(log_file) + summaries.append(summary) + except Exception as e: + print(f"Error processing {log_file}: {e}", file=sys.stderr) + 
+def process_xml_file(xml_file): + try: + xml = JUnitXml.fromfile(xml_file) + ut = os.path.basename(xml_file).split('.')[0] + category = determine_category(ut) + + for suite in xml: + suite_summary = { + 'Category': category, + 'UT': ut, + 'Test cases': suite.tests, + 'Passed': suite.tests - suite.skipped - suite.failures - suite.errors, + 'Skipped': suite.skipped, + 'Failures': suite.failures, + 'Errors': suite.errors, + 'Source': 'XML' + } + summaries.append(suite_summary) + + for case in suite: + if get_result(case) not in ["passed", "skipped"]: + failures.append(case) + except Exception as e: + print(f"Error processing {xml_file}: {e}", file=sys.stderr) + +def print_summary(): + print("### Results Summary") + print_header = True + + for summary in summaries: + print_md_row({ + 'Category': summary['Category'], + 'UT': summary['UT'], + 'Test cases': summary['Test cases'], + 'Passed': summary['Passed'], + 'Skipped': summary['Skipped'], + 'Failures': summary['Failures'], + 'Errors': summary['Errors'], + 'Source': summary['Source'] + }, print_header) + print_header = False -xmls = [ JUnitXml.fromfile(f) for f in args.junitxml ] -for idx, xml in enumerate(xmls): - for suite in xml: - suites.append(suite) - for case in suite: - classname = get_classname(case) - name = get_name(case) - result = get_result(case) - if result not in ["passed", "skipped"]: - failures.append(case) - -printed = False -def print_break(needed): - if needed: - print("") - -if failures: - print_break(printed) - print("### Failures") - print_cases(failures) - printed = True - -print("### Results Summary") -print_suite(suites) - -sys.exit(0) +def main(): + for input_file in args.input_files: + if input_file.endswith('.log'): + process_log_file(input_file) + elif input_file.endswith('.xml'): + process_xml_file(input_file) + else: + print(f"Skipping unknown file type: {input_file}", file=sys.stderr) + + print_failures() + print_summary() + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index f1410abec..8a4cc0b45 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -182,6 +182,18 @@ jobs: cd ../pytorch/third_party/torch-xpu-ops/test/xpu timeout 10000 python run_test_with_skip.py 2>${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test_error.log | tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test.log cp *.xml ${{ github.workspace }}/ut_log + find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c ' + dir_path=$(dirname "$1"); + case "$dir_path" in + *"op_ut_with_skip_quantization/core"*) + dir_name="op_ut_with_skip_quantization_core";; + *) + dir_name=$(basename "$dir_path");; + esac; + mv "$1" "$dir_path/${dir_name}_$(basename "$1")" + ' _ {} \; + cp op_ut_with_skip_nn/*.xml ${{ github.workspace }}/ut_log + cp op_ut_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log # Cases run with a on-demand white list, since some suites are too # slow to go through all operators on CPU. So add cases on-demand # when XPU implementatoin is done. 
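Note on the check-ut.py rework above: the script now accepts both JUnit XML files and plain unittest log files, derives one summary row per input, and prints GitHub-flavored markdown tables for the step summary (a later patch in this series invokes it as `python .github/scripts/check-ut.py .../ut_log/pytorch_distributed_summary/* >> $GITHUB_STEP_SUMMARY`). The snippet below is a minimal standalone sketch of the log-parsing path only, not the script itself; the sample log text and the "pytorch_distributed" UT name are invented for illustration.

import re

SAMPLE_LOG = """\
test_allreduce (__mp_main__.TestXCCL) ... ok
test_broadcast (__mp_main__.TestXCCL) ... skipped 'not supported on XPU'
FAIL: test_scatter (__mp_main__.TestXCCL)
RuntimeError: oneCCL: average operation is not supported for the scheduler path
--------------------
Ran 3 tests in 12.345s

FAILED (failures=1, skipped=1)
"""

def summarize(log_text, ut_name="pytorch_distributed"):
    # Roughly what parse_log_file() does: pull the totals out of the unittest
    # footer (the real script additionally walks FAIL: blocks for error messages).
    ran = re.search(r"Ran (\d+) tests in [\d.]+s", log_text)
    skipped = re.search(r"skipped[ =](\d+)", log_text, re.IGNORECASE)
    failed = re.search(r"failures=(\d+)", log_text)
    tests = int(ran.group(1)) if ran else 0
    n_skip = int(skipped.group(1)) if skipped else 0
    n_fail = int(failed.group(1)) if failed else 0
    return {
        "UT": ut_name,
        "Test cases": tests,
        "Passed": tests - n_skip - n_fail,
        "Skipped": n_skip,
        "Failures": n_fail,
    }

def print_md_row(row, print_header=False):
    # Mirrors the markdown-table helper: a header row plus one row per summary.
    if print_header:
        print("| " + " | ".join(row.keys()) + " |")
        print("| " + " | ".join(["---"] * len(row)) + " |")
    print("| " + " | ".join(str(v) for v in row.values()) + " |")

if __name__ == "__main__":
    print_md_row(summarize(SAMPLE_LOG), print_header=True)
    # | UT | Test cases | Passed | Skipped | Failures |
    # | --- | --- | --- | --- | --- |
    # | pytorch_distributed | 3 | 1 | 1 | 1 |
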
From 4c3651eae2af3fcb3d0d1c2039e4210110a01de7 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:03:10 +0000 Subject: [PATCH 31/83] update ut --- .github/workflows/_linux_ut.yml | 140 -------------------------------- 1 file changed, 140 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 8a4cc0b45..7ac9615dd 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -564,143 +564,3 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed path: ${{ github.workspace }}/ut_log - - pytorch_distributed_test: - runs-on: ${{ inputs.runner }} - if: contains(inputs.ut, 'pytorch_distributed') - timeout-minutes: 900 - env: - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Stock Pytorch - run: | - pwd - which conda && conda clean -ay - conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch - pip install requests - git clone https://github.com/daisyden/pytorch.git pytorch - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi - fi - - name: Triton Installation - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../pytorch - TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" - if [ -z ${{ inputs.triton }} ]; then - TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" - else - TRITON_COMMIT_ID="${{ inputs.triton }}" - fi - echo ${TRITON_REPO}@${TRITON_COMMIT_ID} - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" - fi - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 - with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} - path: ${{ github.workspace }} - - name: Install Pytorch XPU - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - source .github/scripts/env.sh ${{ inputs.pytorch }} - pip install mkl-static==2025.0.1 mkl-include==2025.0.1 - if [[ ${{ inputs.abi }} == '0' ]]; then - export _GLIBCXX_USE_CXX11_ABI=0 - else - export _GLIBCXX_USE_CXX11_ABI=1 - fi - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd ../pytorch - export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - pip install -r requirements.txt - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. 
- else - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ../pytorch - git reset --hard && git checkout ${TORCH_COMMIT_ID} - TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cd ${{ github.workspace }} - sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - - name: UT Test Results Check - shell: bash - run: | - function contains() { - contains_status="echo 'Start $2 ...'" - { - [[ $1 =~ (^|,)$2($|,) ]] - } || { - echo "[Warning] $2 is not suppotted type! Skipped!" - contains_status="continue" - } - } - set -xe - echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - cd ${{ github.workspace }}/ut_log/pytorch_distributed - cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ - bash ut_result_check.sh 'pytorch_distributed' - - name: Upload Inductor XPU UT Log - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed - path: ${{ github.workspace }}/ut_log From 6f635a7c64e22fc28dadef515841acdf42ce04f7 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Wed, 16 Apr 2025 02:30:33 -0700 Subject: [PATCH 32/83] add distributed ut summary --- .github/workflows/_linux_ut.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 7ac9615dd..183a95680 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -540,7 +540,14 @@ jobs: fi python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cd ${{ github.workspace }} + mkdir -p ut_log/pytorch_distributed_summary + cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: Distributed UT Test Results Summary + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + pip install junitparser + python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/pytorch_distributed_summary/* >> $GITHUB_STEP_SUMMARY || true - name: UT Test Results Check shell: bash run: | From f0e1128d1ceccb8f49ac4285854ee81a52d1dc40 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Wed, 16 Apr 2025 02:32:39 -0700 Subject: [PATCH 33/83] align the path --- .github/workflows/_linux_ut.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 183a95680..937425699 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -532,6 +532,7 @@ jobs: sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk sudo echo "0"|sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/pytorch_distributed + mkdir -p ut_log/pytorch_distributed_summary cd ../pytorch/third_party/torch-xpu-ops/test/xpu XCCL_EANBLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE}}" == 'False' ]]; then @@ -539,9 +540,8 @@ jobs: exit 1 fi python run_distributed_local.py 2>${{ github.workspace 
}}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cd ${{ github.workspace }} - mkdir -p ut_log/pytorch_distributed_summary cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary + cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: Distributed UT Test Results Summary run: | From e9b1ba9b060d85920f0f185317f0bce6a4d0a3cb Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:19:12 +0000 Subject: [PATCH 34/83] fix lint issue --- test/xpu/run_distributed_local.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index cc40373bc..9cd07cc55 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -83,18 +83,27 @@ def run(test_command): for line in err.split("\n"): if line.startswith("FAILED (failures="): num_errs = line.split("=")[1].split(")")[0].strip() - error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + error_log += ( + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n") else: - error_log += (line + "\n") + error_log += line + "\n" else: - error_log += ("FAIL: " + err) + error_log += "FAIL: " + err else: if i == len(fail.stderr.split("FAIL: ")) - 1: error_log += "FAIL: " for line in err.split("\n"): if line.startswith("FAILED (failures="): num_errs = line.split("=")[1].split(")")[0].strip() - error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + error_log += ( + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n" + ) renamed_key = key.replace("../../../../", "").replace("/", "_") if num_err > 0: @@ -102,7 +111,8 @@ def run(test_command): with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(error_log) else: - import pdb;pdb.set_trace() + import pdb + pdb.set_trace() with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(fail.stdout) f.write(fail.stderr) From 14773da97d3d5e8a4231a2a027e32a7007892abb Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:23:49 +0000 Subject: [PATCH 35/83] fix lint issue --- test/xpu/run_distributed_local.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 9cd07cc55..014a81235 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -49,6 +49,7 @@ from xpu_test_utils import launch_test + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -84,10 +85,11 @@ def run(test_command): if line.startswith("FAILED (failures="): num_errs = line.split("=")[1].split(")")[0].strip() error_log += ( - "FAILED (failures=" - + str(int(num_errs) - num_skipped) - + f" skipped {num_skipped} cases" - + ")\n") + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n" + ) else: error_log += line + "\n" else: @@ -99,9 +101,9 @@ def run(test_command): if line.startswith("FAILED (failures="): num_errs = line.split("=")[1].split(")")[0].strip() error_log += ( - "FAILED (failures=" - + str(int(num_errs) - num_skipped) - + f" skipped {num_skipped} 
cases" + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + ")\n" ) @@ -112,6 +114,7 @@ def run(test_command): f.write(error_log) else: import pdb + pdb.set_trace() with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(fail.stdout) From 5d9d94bea923f6c6d45f21c3ee5feac523e166b3 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:29:15 +0000 Subject: [PATCH 36/83] fix lint issue --- test/xpu/skip_list_dist_local.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 9ec4c59e0..d7ef3b461 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -17,8 +17,8 @@ "test_checkpoint_submodule_use_reentrant_False_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_ddp_parity_xpu", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_ddp_parity_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": ( From 5197d87713e5d85bc81fc0866a67a14728f3390f Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:50:43 +0000 Subject: [PATCH 37/83] update --- .github/workflows/_linux_ut.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 937425699..44e9cf90c 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -313,7 +313,7 @@ jobs: rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests git clone https://github.com/pytorch/pytorch pytorch if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then @@ -532,7 +532,6 @@ jobs: sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk sudo echo "0"|sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/pytorch_distributed - mkdir -p ut_log/pytorch_distributed_summary cd ../pytorch/third_party/torch-xpu-ops/test/xpu XCCL_EANBLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE}}" == 'False' ]]; then @@ -540,8 +539,9 @@ jobs: exit 1 fi python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary cd ${{ github.workspace }} + mkdir -p ut_log/pytorch_distributed_summary + cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: Distributed UT Test Results Summary run: | From be64dbe1df07dc87502fdd255024a62c781f717b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 18:01:42 +0000 Subject: [PATCH 38/83] update --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 44e9cf90c..d79848ed1 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -60,7 +60,7 @@ jobs: rm -rf $(dirname 
${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests git clone https://github.com/pytorch/pytorch pytorch if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then From 0e445777e5f6c7d8e3cd33bc2eb09c4352390512 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 18:23:09 +0000 Subject: [PATCH 39/83] update --- .github/workflows/_linux_build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index ae6c2064c..d1b9b98b0 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -63,7 +63,7 @@ jobs: rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_build conda create -n xpu_build python=${{ inputs.python }} cmake=3.28 ninja -y source activate xpu_build - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch From d0a0609f33be30c748e9b5846f14e07dbccc9f5b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Thu, 17 Apr 2025 01:30:28 +0000 Subject: [PATCH 40/83] comment pdb --- test/xpu/run_distributed_local.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 014a81235..9bdffb00c 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -113,9 +113,8 @@ def run(test_command): with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(error_log) else: - import pdb - - pdb.set_trace() + # import pdb + # pdb.set_trace() with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(fail.stdout) f.write(fail.stderr) From 65d1953354a7854bc62cb30baf0e5346858d2b1b Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Wed, 16 Apr 2025 18:18:08 -0700 Subject: [PATCH 41/83] align the path --- .github/workflows/_linux_ut.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index d79848ed1..5b81ce89d 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -532,16 +532,16 @@ jobs: sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk sudo echo "0"|sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/pytorch_distributed + mkdir -p ut_log/pytorch_distributed_summary cd ../pytorch/third_party/torch-xpu-ops/test/xpu XCCL_EANBLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE}}" == 'False' ]]; then echo -e "[ERROR] XCCL is not enabled" exit 1 fi - python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cd ${{ github.workspace }} - mkdir -p ut_log/pytorch_distributed_summary + python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary + cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: 
Distributed UT Test Results Summary run: | From 415abe78948428338bdf88891cb3b6b6bba7ec25 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Fri, 18 Apr 2025 02:17:54 -0700 Subject: [PATCH 42/83] Skipped error cases Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 1 - test/xpu/skip_list_dist_local.py | 94 ++++++++++++++++++++++++++++--- test/xpu/xpu_test_utils.py | 7 ++- 3 files changed, 89 insertions(+), 13 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index cc40373bc..cb0a5024b 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -102,7 +102,6 @@ def run(test_command): with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(error_log) else: - import pdb;pdb.set_trace() with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(fail.stdout) f.write(fail.stderr) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 9ec4c59e0..91af20a1a 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -134,7 +134,9 @@ # https://github.com/intel/torch-xpu-ops/issues/1504 "test_fsdp_optimizer_overlap", ), - "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": ( + "test_buffer_dtype_no_root_handle", + ), "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 "test_multi_forward_cpu", @@ -247,6 +249,10 @@ "test_gather_object_xpu", "test_gather_object_list_cpu", "test_gather_object_list_xpu", + # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_scatter_object_list_cpu", + "test_scatter_object_list_xpu", ), "../../../../test/distributed/test_compute_comm_reordering.py": None, "../../../../test/distributed/test_control_collectives.py": None, @@ -365,11 +371,34 @@ # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. 
# https://github.com/intel/torch-xpu-ops/issues/1556 "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_loss_parallel", + "test_mlp_training_is_seq_parallel_False_recompute_activation_False", + "test_mlp_training_is_seq_parallel_True_recompute_activation_False", + "test_transformer_req_grad_float64_thaw_all", + "test_transformer_training_is_seq_parallel_False_float64", + "test_transformer_training_is_seq_parallel_True_float64", + "test_sequence_parallel_style", ), "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, - "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_linear_col_wise_parallel", + "test_parallelize_mlp_with_module_api", + "test_parallelize_mlp_with_module_api_nested", + "test_parallelize_module_multi_wildcard", + "test_parallelize_module_src_data_rank", + "test_parallelize_module_with_digit", + "test_parallelize_module_with_question", + "test_parallelize_module_with_star", + "test_under_devicemesh_context", + ), "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, - "../../../../test/distributed/tensor/test_api.py": None, + "../../../../test/distributed/tensor/test_api.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_distribute_tensor_rank", + "test_distribute_tensor_uneven_sharding", + ), "../../../../test/distributed/tensor/test_attention.py": None, "../../../../test/distributed/tensor/test_common_rules.py": None, "../../../../test/distributed/tensor/test_dtensor.py": ( @@ -401,6 +430,15 @@ # https://github.com/intel/torch-xpu-ops/issues/1508 "test_mean", "test_nll_loss_and_cross_entropy", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_cumsum", + "test_layer_norm_bwd", + "test_layer_norm_bwd_req_grad", + "test_layer_norm_fwd", + "test_linear_op_reductions", + "test_shard0_svd", + "test_softmax_fwd", + "test_topk", ), "../../../../test/distributed/tensor/test_random_ops.py": ( # Need to update world size @@ -409,12 +447,39 @@ "../../../../test/distributed/tensor/test_redistribute.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 "test_redistribute_shard_dim_multi_dim_mesh", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_redistribute_shard_dim_change", + "test_redistribute_uneven_sharding", + "test_shard_to_replicate_forward_backward", + "test_shard_to_replicate_forward_backward_datatype_conversion", + "test_multi_dim_mesh", + ), + "../../../../test/distributed/tensor/test_tensor_ops.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_aten_contiguous", + "test_gather", + "test_index", + "test_op_out_variant", + "test_slice", + "test_stack", + "test_where_type_promotion", + ), + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_argmax", + "test_softmax_fwd", + ), + "../../../../test/distributed/_shard/test_sharder.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_custom_sharder", ), - "../../../../test/distributed/tensor/test_tensor_ops.py": None, - "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, - "../../../../test/distributed/_shard/test_sharder.py": None, # FSDP2 - "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": 
None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_nontensor_activations", + "test_unused_forward_module", + "test_unused_forward_output", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 "test_clip_grad_norm_2d", @@ -425,7 +490,10 @@ ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_train_mixed_requires_grad_per_group", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 "test_gradient_scaler", @@ -437,7 +505,12 @@ # https://github.com/intel/torch-xpu-ops/issues/1535 "test_fully_shard_training_memory", ), - "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_compute_dtype", + "test_grad_acc_with_reduce_dtype", + "test_reduce_dtype", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": ( # Performance test, should skip "test_fully_shard_training_overlap", @@ -453,6 +526,9 @@ # https://github.com/intel/torch-xpu-ops/issues/1504 "test_train_parity_multi_group_unshard_async_op", "test_train_parity_with_activation_checkpointing", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_1f1b_microbatching", + "test_gradient_accumulation", ), } diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 92a91355a..d58d3d9a5 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1153,6 +1153,7 @@ def copy_tests( def launch_test(test_case, skip_list=None, exe_list=None): os.environ["PYTORCH_ENABLE_XPU_FALLBACK"] = "1" os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" + rename = test_case.replace("../../../../", "").replace("/", "_") if skip_list is not None: skip_options = ' -k "not ' + skip_list[0] for skip_case in skip_list[1:]: @@ -1160,7 +1161,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - f"pytest -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) test_command += skip_options elif exe_list is not None: @@ -1170,11 +1171,11 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - f"pytest -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) test_command += exe_options else: test_command = ( - f"pytest -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) return os.system(test_command) From c555fbba106852870051801e7c187228b65c7791 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Fri, 18 Apr 2025 02:31:41 -0700 Subject: [PATCH 43/83] fixed lint error Signed-off-by: Cheng, Penghui --- 
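Note: this patch appears to be a formatting-only reflow of the string concatenation in launch_test() flagged by the linter; the generated pytest command is unchanged. For reference, a minimal sketch of what launch_test() produces for one skip_dict entry, using the test_fsdp_misc.py entry as it appears later in this series; the " and not " joining of additional skips is assumed from the pytest -k syntax used here.

test_case = "../../../../test/distributed/fsdp/test_fsdp_misc.py"
skip_list = ("test_fsdp_optimizer_overlap",)

# Strip the relative prefix and flatten the path, as the rename line added in the
# previous patch does, so each suite writes a unique op_ut_with_skip_*.xml report.
rename = test_case.replace("../../../../", "").replace("/", "_")

skip_options = ' -k "not ' + skip_list[0]
for skip_case in skip_list[1:]:
    skip_options += " and not " + skip_case  # assumed joiner for extra skipped cases
skip_options += '"'

test_command = (
    f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case + skip_options
)
print(test_command)
# -> pytest -v --junit-xml=./op_ut_with_skip_test_distributed_fsdp_test_fsdp_misc.py.xml ../../../../test/distributed/fsdp/test_fsdp_misc.py -k "not test_fsdp_optimizer_overlap"
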
test/xpu/xpu_test_utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 79a2b6eb6..8d451523a 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1172,13 +1172,11 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " - + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) test_command += exe_options else: test_command = ( - f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " - + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) return os.system(test_command) From 6d6a75e7262f176ee76670d09eeaad26c72d8669 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Fri, 18 Apr 2025 02:33:55 -0700 Subject: [PATCH 44/83] fixed lint error Signed-off-by: Cheng, Penghui --- test/xpu/xpu_test_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 8d451523a..d58d3d9a5 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1161,8 +1161,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " - + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) test_command += skip_options elif exe_list is not None: From 1f451b2907297085322ea4133dbeb133434a3b60 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Wed, 23 Apr 2025 23:23:57 -0700 Subject: [PATCH 45/83] Add some UT cases Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 48 +++++++++++++++++--------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 4dd928ea9..49584c65a 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -282,16 +282,22 @@ "test_fsdp_setattr", "test_fsdp_unspecialized_forced_getattr_inline", "test_fsdp_unspecialized_forced_getattr_no_inline", - # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) - # https://github.com/intel/torch-xpu-ops/issues/1526 - "test_get_pg_attr", ), "../../../../test/distributed/test_fake_pg.py": None, - "../../../../test/distributed/test_functional_api.py": ( - # RuntimeError: UR backend failed. 
UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) - # https://github.com/intel/torch-xpu-ops/issues/1526 - "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", + "../../../../test/distributed/test_functional_api.py": None, + "../../../../test/distributed/test_inductor_collectives.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1581 + # Fatal Python error: Segmentation fault + "test_dynamo_rewrite_dist_all_gather", + "test_dynamo_rewrite_dist_all_gather_list", + "test_dynamo_rewrite_dist_all_gather_args_match", + "test_dynamo_rewrite_dist_reduce_scatter", + "test_dynamo_support_collective_op_with_async_op_False", + "test_dynamo_trace_reduce_scatter_tensor", + "test_dynamo_trace_all_gather_tensor", + "test_dynamo_trace_allgather_coalesced", + "test_inductor_reduce_scatter_coalesced", + "test_inductor_all_gather_coalesced", ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. @@ -401,21 +407,7 @@ ), "../../../../test/distributed/tensor/test_attention.py": None, "../../../../test/distributed/tensor/test_common_rules.py": None, - "../../../../test/distributed/tensor/test_dtensor.py": ( - # Passed with updated test code for world_size 8 - "test_auto_implicit_replication", - "test_default_value_sub_mesh", - "test_device_mesh_nd", - "test_dtensor_2d_mesh", - "test_dtensor_api_device_mesh_context_manager", - "test_dtensor_device_mesh_device_conversion", - "test_dtensor_spec_local_shard_offset", - "test_from_local_sub_mesh", - "test_implicit_replication", - "test_metadata_consistency_check", - "test_redistribute_sub_mesh", - "test_split_tensor_1D", - ), + "../../../../test/distributed/tensor/test_dtensor.py": None, "../../../../test/distributed/tensor/test_dtensor_compile.py": ( # https://jira.devtools.intel.com/browse/MLSL-3625 "test_2d_fsdp_tp_compile", @@ -530,6 +522,16 @@ "test_1f1b_microbatching", "test_gradient_accumulation", ), + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_tracker_multi_group_eager", + "test_tracker_non_root_forward_backward", + "test_tracker_with_activation_checkpointing", + ), + "../../../../test/distributed/_tools/test_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_memory_tracker.py": None, + "../../../../test/distributed/_tools/test_mod_tracker.py": None, } skip_dict_python = { From b2c5875eb61426dfb162d68af4d056d9f94ca07a Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Tue, 29 Apr 2025 03:10:18 -0700 Subject: [PATCH 46/83] Add UT cases for _shard and _tools folder Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 38 ++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 49584c65a..9626f6e35 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -461,10 +461,6 @@ "test_argmax", "test_softmax_fwd", ), - "../../../../test/distributed/_shard/test_sharder.py": ( - # https://jira.devtools.intel.com/browse/MLSL-3625 - "test_custom_sharder", - ), # FSDP2 "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": ( # 
https://jira.devtools.intel.com/browse/MLSL-3625 @@ -522,6 +518,40 @@ "test_1f1b_microbatching", "test_gradient_accumulation", ), + "../../../../test/distributed/_composable/test_replicate_with_compiler.py": ( + # AssertionError: Tensor-likes are not close! + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_compile_backward_only", + "test_compile_bf16", + "test_compile_fp16", + "test_compile_gpu", + "test_compile_gpu_ac", + ), + "../../../../test/distributed/_shard/test_sharder.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_custom_sharder", + ), + "../../../../test/distributed/_shard/sharded_tensor/test_logger.py": None, + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor.py": { + # RuntimeError: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) + "test_complete_world_size", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_grid_sharding", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_with_rpc_names", + "test_init_from_local_tensor", + # what(): Attempting to send a Tensor with unexpected device type xpu:3 + # https://github.com/intel/torch-xpu-ops/issues/1616 + "test_init_from_local_shards", + "test_init_from_local_shards_and_global_metadata", + }, + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py": None, + "../../../../test/distributed/_shard/sharding_plan/test_sharding_plan.py": None, + "../../../../test/distributed/_shard/sharding_spec/test_sharding_spec.py": None, "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path From 177d7c0cc0f3e9223695e3e5b6ee782a9564f3e8 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Mon, 5 May 2025 03:18:25 -0700 Subject: [PATCH 47/83] Clean skip list Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 141 ++++++++++++++++--------------- 1 file changed, 73 insertions(+), 68 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 9626f6e35..0254e69a3 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -9,7 +9,7 @@ "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", + # "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", @@ -37,7 +37,7 @@ ), "../../../../test/distributed/fsdp/test_fsdp_core.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_delayed_optim_step_offload_true_no_shard_xpu", + # "test_delayed_optim_step_offload_true_no_shard_xpu", "test_transformer_no_grad_mixed_precision_True_xpu", "test_delayed_optim_step_offload_false_no_shard_xpu", "test_delayed_optim_step_offload_false_none_xpu", @@ -74,10 +74,11 @@ 
"test_transformer_offload_true_none_xpu", "test_transformer_offload_true_shard_grad_op_xpu", ), - "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 - " test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + # ( + # # https://github.com/intel/torch-xpu-ops/issues/1504 + # " test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", + # ), "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 @@ -129,14 +130,15 @@ "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1535 - "test_fsdp_zero2_eval_with_prefetch", + # # https://github.com/intel/torch-xpu-ops/issues/1535 + # "test_fsdp_zero2_eval_with_prefetch", # https://github.com/intel/torch-xpu-ops/issues/1504 "test_fsdp_optimizer_overlap", ), - "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": ( - "test_buffer_dtype_no_root_handle", - ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + # ( + # "test_buffer_dtype_no_root_handle", + # ), "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 "test_multi_forward_cpu", @@ -225,30 +227,30 @@ # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path "test_reduce_scatter_tensor_coalesced", "test_reduce_scatter_tensor_single", - # https://github.com/intel/torch-xpu-ops/issues/1525 - # ValueError: trying to initialize the default process group twice! - "test_inductor_all_gather_into_tensor_coalesced", - "test_inductor_all_gather_into_tensor_single", - "test_inductor_all_reduce_coalesced", - "test_inductor_all_reduce_non_contig_input", - "test_inductor_all_reduce_single", - "test_inductor_all_to_all_single", - "test_inductor_broadcast", - "test_inductor_inplace_op_on_view", - "test_inductor_reduce_scatter_tensor_coalesced", - "test_inductor_reduce_scatter_tensor_single", - "test_inductor_reuse_buffer_after_inplace_collective", - "test_ranks_and_tag", - "test_wait_tensor", + # # https://github.com/intel/torch-xpu-ops/issues/1525 + # # ValueError: trying to initialize the default process group twice! 
+ # "test_inductor_all_gather_into_tensor_coalesced", + # "test_inductor_all_gather_into_tensor_single", + # "test_inductor_all_reduce_coalesced", + # "test_inductor_all_reduce_non_contig_input", + # "test_inductor_all_reduce_single", + # "test_inductor_all_to_all_single", + # "test_inductor_broadcast", + # "test_inductor_inplace_op_on_view", + # "test_inductor_reduce_scatter_tensor_coalesced", + # "test_inductor_reduce_scatter_tensor_single", + # "test_inductor_reuse_buffer_after_inplace_collective", + # "test_ranks_and_tag", + # "test_wait_tensor", ), "../../../../test/distributed/test_c10d_logger.py": None, "../../../../test/distributed/test_c10d_object_collectives.py": ( - # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds - # https://github.com/intel/torch-xpu-ops/issues/1535 - "test_gather_object_cpu", - "test_gather_object_xpu", - "test_gather_object_list_cpu", - "test_gather_object_list_xpu", + # # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # # https://github.com/intel/torch-xpu-ops/issues/1535 + # "test_gather_object_cpu", + # "test_gather_object_xpu", + # "test_gather_object_list_cpu", + # "test_gather_object_list_xpu", # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! # https://jira.devtools.intel.com/browse/MLSL-3625 "test_scatter_object_list_cpu", @@ -258,30 +260,30 @@ "../../../../test/distributed/test_control_collectives.py": None, "../../../../test/distributed/test_device_mesh.py": None, "../../../../test/distributed/test_dynamo_distributed.py": ( - # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' - "test_asymmetric_compilation", - "test_asymmetric_compilation_with_fx_cache", - # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. - "test_compiled_flex_attention_full_model_ddp", - "test_compiled_flex_attention_local_ddp", + # # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + # "test_asymmetric_compilation", + # "test_asymmetric_compilation_with_fx_cache", + # # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
+ # "test_compiled_flex_attention_full_model_ddp", + # "test_compiled_flex_attention_local_ddp", # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ # https://github.com/intel/torch-xpu-ops/issues/1527 - "test_compiler_collectives_automatic_dynamic_scalar", - "test_compiler_collectives_automatic_dynamic_speculation_divergence", - "test_compiler_collectives_automatic_dynamic_tensor", - "test_compiler_collectives_dim_mismatch", - "test_compiler_collectives_graph_break_empty_graph_still_collective", - "test_compiler_collectives_missing_source", - "test_compiler_collectives_scalar_missing_source", - "test_compiler_collectives_type_mismatch", - "test_ddp_activation_checkpointing", - "test_ddp_baseline_aot_eager_multiprocess", - "test_fsdp_activation_checkpointing", - "test_fsdp_aot_eager", - "test_fsdp_inductor", + # "test_compiler_collectives_automatic_dynamic_scalar", + # "test_compiler_collectives_automatic_dynamic_speculation_divergence", + # "test_compiler_collectives_automatic_dynamic_tensor", + # "test_compiler_collectives_dim_mismatch", + # "test_compiler_collectives_graph_break_empty_graph_still_collective", + # "test_compiler_collectives_missing_source", + # "test_compiler_collectives_scalar_missing_source", + # "test_compiler_collectives_type_mismatch", + # "test_ddp_activation_checkpointing", + # "test_ddp_baseline_aot_eager_multiprocess", + # "test_fsdp_activation_checkpointing", + # "test_fsdp_aot_eager", + # "test_fsdp_inductor", "test_fsdp_setattr", - "test_fsdp_unspecialized_forced_getattr_inline", - "test_fsdp_unspecialized_forced_getattr_no_inline", + # "test_fsdp_unspecialized_forced_getattr_inline", + # "test_fsdp_unspecialized_forced_getattr_no_inline", ), "../../../../test/distributed/test_fake_pg.py": None, "../../../../test/distributed/test_functional_api.py": None, @@ -299,11 +301,12 @@ "test_inductor_reduce_scatter_coalesced", "test_inductor_all_gather_coalesced", ), - "../../../../test/distributed/test_multi_threaded_pg.py": ( - # oneccl not support multi-threaded well, so skip it first. - # https://github.com/intel/torch-xpu-ops/issues/1509 - "test_bwd_sees_fwd_pg", - ), + "../../../../test/distributed/test_multi_threaded_pg.py": None, + # ( + # # oneccl not support multi-threaded well, so skip it first. 
+ # # https://github.com/intel/torch-xpu-ops/issues/1509 + # "test_bwd_sees_fwd_pg", + # ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 @@ -408,10 +411,11 @@ "../../../../test/distributed/tensor/test_attention.py": None, "../../../../test/distributed/tensor/test_common_rules.py": None, "../../../../test/distributed/tensor/test_dtensor.py": None, - "../../../../test/distributed/tensor/test_dtensor_compile.py": ( - # https://jira.devtools.intel.com/browse/MLSL-3625 - "test_2d_fsdp_tp_compile", - ), + "../../../../test/distributed/tensor/test_dtensor_compile.py": None, + # ( + # # https://jira.devtools.intel.com/browse/MLSL-3625 + # "test_2d_fsdp_tp_compile", + # ), "../../../../test/distributed/tensor/test_experimental_ops.py": ( # https://github.com/intel/torch-xpu-ops/issues/1535 "test_bernoulli", @@ -451,10 +455,10 @@ "test_aten_contiguous", "test_gather", "test_index", - "test_op_out_variant", + # "test_op_out_variant", "test_slice", "test_stack", - "test_where_type_promotion", + # "test_where_type_promotion", ), "../../../../test/distributed/tensor/experimental/test_register_sharding.py": ( # https://jira.devtools.intel.com/browse/MLSL-3625 @@ -482,10 +486,11 @@ # https://jira.devtools.intel.com/browse/MLSL-3625 "test_train_mixed_requires_grad_per_group", ), - "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1508 - "test_gradient_scaler", - ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + # ( + # # https://github.com/intel/torch-xpu-ops/issues/1508 + # "test_gradient_scaler", + # ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, From 939352d8723082e6ae8b0570653214164e69e92d Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Wed, 14 May 2025 20:54:51 -0700 Subject: [PATCH 48/83] clean skip list for distributed Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 152 +++++++++++-------------------- 1 file changed, 51 insertions(+), 101 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0254e69a3..5f70354f8 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -8,8 +8,8 @@ "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 - # "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", + # https://github.com/intel/torch-xpu-ops/issues/1666, 2.8 skipped + "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", @@ -37,7 +37,6 @@ ), "../../../../test/distributed/fsdp/test_fsdp_core.py": ( # 
https://github.com/intel/torch-xpu-ops/issues/1504 - # "test_delayed_optim_step_offload_true_no_shard_xpu", "test_transformer_no_grad_mixed_precision_True_xpu", "test_delayed_optim_step_offload_false_no_shard_xpu", "test_delayed_optim_step_offload_false_none_xpu", @@ -75,10 +74,6 @@ "test_transformer_offload_true_shard_grad_op_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, - # ( - # # https://github.com/intel/torch-xpu-ops/issues/1504 - # " test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # ), "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 @@ -130,21 +125,15 @@ "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( - # # https://github.com/intel/torch-xpu-ops/issues/1535 - # "test_fsdp_zero2_eval_with_prefetch", - # https://github.com/intel/torch-xpu-ops/issues/1504 + # https://github.com/intel/torch-xpu-ops/issues/1504, Performance test, should skip "test_fsdp_optimizer_overlap", ), "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, - # ( - # "test_buffer_dtype_no_root_handle", - # ), "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 "test_multi_forward_cpu", ), "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, - # https://github.com/intel/torch-xpu-ops/issues/1537 "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 "test_flatten_sharded_optim_state_dict_nested", @@ -245,12 +234,6 @@ ), "../../../../test/distributed/test_c10d_logger.py": None, "../../../../test/distributed/test_c10d_object_collectives.py": ( - # # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds - # # https://github.com/intel/torch-xpu-ops/issues/1535 - # "test_gather_object_cpu", - # "test_gather_object_xpu", - # "test_gather_object_list_cpu", - # "test_gather_object_list_xpu", # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! # https://jira.devtools.intel.com/browse/MLSL-3625 "test_scatter_object_list_cpu", @@ -260,35 +243,14 @@ "../../../../test/distributed/test_control_collectives.py": None, "../../../../test/distributed/test_device_mesh.py": None, "../../../../test/distributed/test_dynamo_distributed.py": ( - # # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' - # "test_asymmetric_compilation", - # "test_asymmetric_compilation_with_fx_cache", - # # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
- # "test_compiled_flex_attention_full_model_ddp", - # "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ - # https://github.com/intel/torch-xpu-ops/issues/1527 - # "test_compiler_collectives_automatic_dynamic_scalar", - # "test_compiler_collectives_automatic_dynamic_speculation_divergence", - # "test_compiler_collectives_automatic_dynamic_tensor", - # "test_compiler_collectives_dim_mismatch", - # "test_compiler_collectives_graph_break_empty_graph_still_collective", - # "test_compiler_collectives_missing_source", - # "test_compiler_collectives_scalar_missing_source", - # "test_compiler_collectives_type_mismatch", - # "test_ddp_activation_checkpointing", - # "test_ddp_baseline_aot_eager_multiprocess", - # "test_fsdp_activation_checkpointing", - # "test_fsdp_aot_eager", - # "test_fsdp_inductor", + # AssertionError: 'setattr() on Tensor.requires_grad' not found in 'Attempted to call function marked as skipped + # https://github.com/intel/torch-xpu-ops/issues/1667, 2.8 skipped "test_fsdp_setattr", - # "test_fsdp_unspecialized_forced_getattr_inline", - # "test_fsdp_unspecialized_forced_getattr_no_inline", ), "../../../../test/distributed/test_fake_pg.py": None, "../../../../test/distributed/test_functional_api.py": None, "../../../../test/distributed/test_inductor_collectives.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1581 + # https://github.com/intel/torch-xpu-ops/issues/1581, 2.8 skipped # Fatal Python error: Segmentation fault "test_dynamo_rewrite_dist_all_gather", "test_dynamo_rewrite_dist_all_gather_list", @@ -302,22 +264,19 @@ "test_inductor_all_gather_coalesced", ), "../../../../test/distributed/test_multi_threaded_pg.py": None, + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, # ( - # # oneccl not support multi-threaded well, so skip it first. 
- # # https://github.com/intel/torch-xpu-ops/issues/1509 - # "test_bwd_sees_fwd_pg", + # # https://github.com/intel/torch-xpu-ops/issues/1504 + # "test_stage_backward_weight_multiple_iters_xpu", + # "test_stage_backward_weight_xpu", + # "test_stage_backward_xpu", + # ), + "../../../../test/distributed/pipelining/test_microbatch.py": None, + # ( + # # https://github.com/intel/torch-xpu-ops/issues/1504, need retest with oneccl fix + # "test_chunk_spec_xpu", # ), - "../../../../test/distributed/test_store.py": None, - "../../../../test/distributed/pipelining/test_backward.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_stage_backward_weight_multiple_iters_xpu", - "test_stage_backward_weight_xpu", - "test_stage_backward_xpu", - ), - "../../../../test/distributed/pipelining/test_microbatch.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_chunk_spec_xpu", - ), "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, @@ -325,7 +284,7 @@ "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' # is not currently implemented for the XPU device - # https://github.com/intel/torch-xpu-ops/issues/1547 + # https://github.com/intel/torch-xpu-ops/issues/1547, 2.8 skipped "test_dtensor_seq_par_shard_dim_0", "test_dtensor_seq_par_shard_dim_1", "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", @@ -334,7 +293,7 @@ "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' - # https://github.com/intel/torch-xpu-ops/issues/1548 + # https://github.com/intel/torch-xpu-ops/issues/1548, 2.8 skipped "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", @@ -342,7 +301,7 @@ "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' - # https://github.com/intel/torch-xpu-ops/issues/1549 + # https://github.com/intel/torch-xpu-ops/issues/1549, 2.8 skipped "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", @@ -350,14 +309,14 @@ "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. 
- # https://github.com/intel/torch-xpu-ops/issues/1550 + # https://github.com/intel/torch-xpu-ops/issues/1550, 2.8 skipped "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' # is not currently implemented for the XPU device. - # https://github.com/intel/torch-xpu-ops/issues/1551 + # https://github.com/intel/torch-xpu-ops/issues/1551, 2.8 skipped "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", @@ -369,7 +328,7 @@ ), "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! - # https://github.com/intel/torch-xpu-ops/issues/1555 + # https://github.com/intel/torch-xpu-ops/issues/1555, 2.8 skipped "test_transformer_req_grad_seq_parallel_float32_thaw_all", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", @@ -378,7 +337,7 @@ "test_transformer_training_is_seq_parallel_False_float32", "test_transformer_training_is_seq_parallel_True_float32", # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. 
- # https://github.com/intel/torch-xpu-ops/issues/1556 + # https://github.com/intel/torch-xpu-ops/issues/1556, 2.8 skipped "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", # https://jira.devtools.intel.com/browse/MLSL-3625 "test_loss_parallel", @@ -412,18 +371,13 @@ "../../../../test/distributed/tensor/test_common_rules.py": None, "../../../../test/distributed/tensor/test_dtensor.py": None, "../../../../test/distributed/tensor/test_dtensor_compile.py": None, - # ( - # # https://jira.devtools.intel.com/browse/MLSL-3625 - # "test_2d_fsdp_tp_compile", - # ), "../../../../test/distributed/tensor/test_experimental_ops.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1535 + # https://github.com/intel/torch-xpu-ops/issues/1604 "test_bernoulli", ), "../../../../test/distributed/tensor/test_init.py": None, "../../../../test/distributed/tensor/test_math_ops.py": ( - # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path - # https://github.com/intel/torch-xpu-ops/issues/1508 + # https://github.com/intel/torch-xpu-ops/issues/1604, hang "test_mean", "test_nll_loss_and_cross_entropy", # https://jira.devtools.intel.com/browse/MLSL-3625 @@ -436,13 +390,8 @@ "test_softmax_fwd", "test_topk", ), - "../../../../test/distributed/tensor/test_random_ops.py": ( - # Need to update world size - "test_hsdp_tp_model_meta_init", - ), + "../../../../test/distributed/tensor/test_random_ops.py": None, "../../../../test/distributed/tensor/test_redistribute.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_redistribute_shard_dim_multi_dim_mesh", # https://jira.devtools.intel.com/browse/MLSL-3625 "test_redistribute_shard_dim_change", "test_redistribute_uneven_sharding", @@ -455,10 +404,9 @@ "test_aten_contiguous", "test_gather", "test_index", - # "test_op_out_variant", "test_slice", "test_stack", - # "test_where_type_promotion", + "test_where_type_promotion", ), "../../../../test/distributed/tensor/experimental/test_register_sharding.py": ( # https://jira.devtools.intel.com/browse/MLSL-3625 @@ -473,29 +421,31 @@ "test_unused_forward_output", ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 + # https://github.com/intel/torch-xpu-ops/issues/1661 "test_clip_grad_norm_2d", ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1571 + # ValueError: Cannot use ReduceOp.PREMUL_SUM with XCCL + # https://github.com/intel/torch-xpu-ops/issues/1571, 2.8 skipped "test_set_reduce_scatter_divide_factor", ), - "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": ( + # torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised + # https://github.com/intel/torch-xpu-ops/issues/1665, 2.8 skipped + "test_transformer_backend_inductor_fullgraph_True", + "test_nested_fully_shard_backend_inductor_fullgraph_True", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": ( # https://jira.devtools.intel.com/browse/MLSL-3625 "test_train_mixed_requires_grad_per_group", ), 
"../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, - # ( - # # https://github.com/intel/torch-xpu-ops/issues/1508 - # "test_gradient_scaler", - # ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1535 + # https://github.com/intel/torch-xpu-ops/issues/1605 "test_fully_shard_training_memory", ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": ( @@ -509,15 +459,15 @@ "test_fully_shard_training_overlap", ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1572 + # Expected zero exit code but got -9 + # https://github.com/intel/torch-xpu-ops/issues/1663 "test_dp_state_dict_cpu_offload", ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1508 - "test_post_optim_event", - # https://github.com/intel/torch-xpu-ops/issues/1504 + # https://github.com/intel/torch-xpu-ops/issues/1661 "test_train_parity_multi_group_unshard_async_op", + # checkpointing issue, 2.8 skipped "test_train_parity_with_activation_checkpointing", # https://jira.devtools.intel.com/browse/MLSL-3625 "test_1f1b_microbatching", @@ -525,7 +475,7 @@ ), "../../../../test/distributed/_composable/test_replicate_with_compiler.py": ( # AssertionError: Tensor-likes are not close! 
- # https://github.com/intel/torch-xpu-ops/issues/1504 + # https://github.com/intel/torch-xpu-ops/issues/1668, 2.8 skipped "test_compile_backward_only", "test_compile_bf16", "test_compile_fp16", @@ -539,6 +489,7 @@ "../../../../test/distributed/_shard/sharded_tensor/test_logger.py": None, "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor.py": { # RuntimeError: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) + # https://github.com/intel/torch-xpu-ops/issues/1617, 2.8 skipped "test_complete_world_size", "test_multiple_local_shards", "test_new_group", @@ -550,20 +501,19 @@ "test_with_rpc_names", "test_init_from_local_tensor", # what(): Attempting to send a Tensor with unexpected device type xpu:3 - # https://github.com/intel/torch-xpu-ops/issues/1616 + # https://github.com/intel/torch-xpu-ops/issues/1616, 2.8 skipped "test_init_from_local_shards", "test_init_from_local_shards_and_global_metadata", }, "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py": None, "../../../../test/distributed/_shard/sharding_plan/test_sharding_plan.py": None, "../../../../test/distributed/_shard/sharding_spec/test_sharding_spec.py": None, - "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1508 - # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path - "test_tracker_multi_group_eager", - "test_tracker_non_root_forward_backward", - "test_tracker_with_activation_checkpointing", - ), + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None, + # ( + # # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # # https://github.com/intel/torch-xpu-ops/issues/1508, 2.8 skipped + # "test_tracker_with_activation_checkpointing", + # ), "../../../../test/distributed/_tools/test_mem_tracker.py": None, "../../../../test/distributed/_tools/test_memory_tracker.py": None, "../../../../test/distributed/_tools/test_mod_tracker.py": None, From 1533b9b70f364f5b48693d22858143d87d37373b Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Wed, 14 May 2025 21:31:30 -0700 Subject: [PATCH 49/83] Add comments for skip list Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 5f70354f8..d99dd2bd0 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -8,6 +8,7 @@ "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( + # Accuracy gap in FSDP checkpoint related UT # https://github.com/intel/torch-xpu-ops/issues/1666, 2.8 skipped "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", @@ -17,11 +18,13 @@ "test_checkpoint_submodule_use_reentrant_False_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_ddp_parity_xpu", ), 
"../../../../test/distributed/fsdp/test_fsdp_comm.py": None, "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_bf16_hook_has_wrapping_False_sharding_strategy0", "test_bf16_hook_has_wrapping_False_sharding_strategy1", @@ -36,6 +39,7 @@ "test_fp16_hook_has_wrapping_True_sharding_strategy2", ), "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_transformer_no_grad_mixed_precision_True_xpu", "test_delayed_optim_step_offload_false_no_shard_xpu", @@ -76,6 +80,7 @@ "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_hooks_multi_traversal_xpu", "test_parity_with_ddp_xpu", @@ -83,6 +88,7 @@ ), "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", @@ -125,16 +131,19 @@ "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504, Performance test, should skip "test_fsdp_optimizer_overlap", ), "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_multi_forward_cpu", ), "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_flatten_sharded_optim_state_dict_nested", "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", @@ -176,12 +185,14 @@ ), # Performance check, skip # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( - # # https://github.com/intel/torch-xpu-ops/issues/1504 + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504 # "test_forward_overlap", # "test_forward_overlap_xpu", # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_fsdp_ddp_parity_with_grad_scaler_offload_false_none_none_none", "test_fsdp_ddp_parity_with_grad_scaler_offload_false_shard_grad_op_none_none", @@ -189,6 +200,7 @@ "test_fsdp_ddp_parity_with_grad_scaler_offload_true_shard_grad_op_none_none", ), "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": ( + # fsdp accuracy gaps # 
https://github.com/intel/torch-xpu-ops/issues/1504 "test_state_dict_save_load_flow_state_dict_type_local_state_dict", "test_state_dict_save_load_flow_state_dict_type_sharded_state_dict", @@ -199,6 +211,7 @@ "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_diff_hyperparams_sharding_strategy_str_full_shard", "test_diff_hyperparams_sharding_strategy_str_no_shard", @@ -267,14 +280,16 @@ "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, # ( - # # https://github.com/intel/torch-xpu-ops/issues/1504 + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504 # "test_stage_backward_weight_multiple_iters_xpu", # "test_stage_backward_weight_xpu", # "test_stage_backward_xpu", # ), "../../../../test/distributed/pipelining/test_microbatch.py": None, # ( - # # https://github.com/intel/torch-xpu-ops/issues/1504, need retest with oneccl fix + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504, need retest with oneccl fix # "test_chunk_spec_xpu", # ), "../../../../test/distributed/pipelining/test_pipe.py": None, From d74615fea55069839956d58c2b1a88d48cd65ec9 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 15 May 2025 23:52:31 -0700 Subject: [PATCH 50/83] move some issues from skip list to known issues report Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 230 +++---------------------------- 1 file changed, 16 insertions(+), 214 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index d99dd2bd0..05942858c 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -17,112 +17,15 @@ "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", "test_checkpoint_submodule_use_reentrant_False_xpu", ), - "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_ddp_parity_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, - "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_bf16_hook_has_wrapping_False_sharding_strategy0", - "test_bf16_hook_has_wrapping_False_sharding_strategy1", - "test_bf16_hook_has_wrapping_False_sharding_strategy2", - "test_bf16_hook_has_wrapping_True_sharding_strategy0", - "test_bf16_hook_has_wrapping_True_sharding_strategy1", - "test_bf16_hook_has_wrapping_True_sharding_strategy2", - "test_fp16_hook_has_wrapping_False_sharding_strategy1", - "test_fp16_hook_has_wrapping_False_sharding_strategy2", - "test_fp16_hook_has_wrapping_True_sharding_strategy0", - "test_fp16_hook_has_wrapping_True_sharding_strategy1", - "test_fp16_hook_has_wrapping_True_sharding_strategy2", - ), - "../../../../test/distributed/fsdp/test_fsdp_core.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_transformer_no_grad_mixed_precision_True_xpu", - "test_delayed_optim_step_offload_false_no_shard_xpu", - 
"test_delayed_optim_step_offload_false_none_xpu", - "test_delayed_optim_step_offload_false_shard_grad_op_xpu", - "test_delayed_optim_step_offload_true_none_xpu", - "test_delayed_optim_step_offload_true_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_false_no_shard_xpu", - "test_delayed_reduce_scatter_offload_false_none_xpu", - "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_true_none_xpu", - "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_offload_false_no_shard_xpu", - "test_mixture_of_experts_offload_false_none_xpu", - "test_mixture_of_experts_offload_false_shard_grad_op_xpu", - "test_mixture_of_experts_offload_true_none_xpu", - "test_mixture_of_experts_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_false_no_shard_xpu", - "test_nested_always_wrap_model_offload_false_none_xpu", - "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_true_none_xpu", - "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_false_no_shard_xpu", - "test_nested_wrapped_model_offload_false_none_xpu", - "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_true_none_xpu", - "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", - "test_transformer_offload_false_none_xpu", - "test_transformer_offload_false_shard_grad_op_xpu", - "test_transformer_offload_true_none_xpu", - "test_transformer_offload_true_shard_grad_op_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": None, "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, - "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_hooks_multi_traversal_xpu", - "test_parity_with_ddp_xpu", - "test_parity_with_non_frozen_fsdp_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, - "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", - 
"test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", - 
"test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True ", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", - ), + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, @@ -136,53 +39,9 @@ "test_fsdp_optimizer_overlap", ), "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, - "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_multi_forward_cpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, - "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_flatten_sharded_optim_state_dict_nested", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_False", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_True", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_False", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_True", - "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", - "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", - "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", - 
"test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", - "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_False", - "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_True", - "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_False", - "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_True", - "test_rekey_optim_state_dict_to_names", - "test_scatter_full_optim_state_dict_nested_halve_world_size", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", - "test_shard_full_optim_state_dict_nested_halve_world_size", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", - "test_use_orig_params", - ), + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": None, # Performance check, skip # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( # # fsdp accuracy gaps @@ -191,60 +50,20 @@ # "test_forward_overlap_xpu", # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, - "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_fsdp_ddp_parity_with_grad_scaler_offload_false_none_none_none", - "test_fsdp_ddp_parity_with_grad_scaler_offload_false_shard_grad_op_none_none", - "test_fsdp_ddp_parity_with_grad_scaler_offload_true_none_none_none", - "test_fsdp_ddp_parity_with_grad_scaler_offload_true_shard_grad_op_none_none", - ), - "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - 
"test_state_dict_save_load_flow_state_dict_type_local_state_dict", - "test_state_dict_save_load_flow_state_dict_type_sharded_state_dict", - "test_state_dict_save_load_flow_state_dict_type_state_dict", - ), + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, - "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_diff_hyperparams_sharding_strategy_str_full_shard", - "test_diff_hyperparams_sharding_strategy_str_no_shard", - "test_diff_hyperparams_sharding_strategy_str_shard_grad_op", - "test_no_sync_correctness", - ), + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, "../../../../test/distributed/fsdp/test_wrap.py": None, "../../../../test/distributed/test_backends.py": None, "../../../../test/distributed/test_c10d_common.py": None, - "../../../../test/distributed/test_c10d_functional_native.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1508 - # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path - "test_reduce_scatter_tensor_coalesced", - "test_reduce_scatter_tensor_single", - # # https://github.com/intel/torch-xpu-ops/issues/1525 - # # ValueError: trying to initialize the default process group twice! - # "test_inductor_all_gather_into_tensor_coalesced", - # "test_inductor_all_gather_into_tensor_single", - # "test_inductor_all_reduce_coalesced", - # "test_inductor_all_reduce_non_contig_input", - # "test_inductor_all_reduce_single", - # "test_inductor_all_to_all_single", - # "test_inductor_broadcast", - # "test_inductor_inplace_op_on_view", - # "test_inductor_reduce_scatter_tensor_coalesced", - # "test_inductor_reduce_scatter_tensor_single", - # "test_inductor_reuse_buffer_after_inplace_collective", - # "test_ranks_and_tag", - # "test_wait_tensor", - ), + "../../../../test/distributed/test_c10d_functional_native.py": None, "../../../../test/distributed/test_c10d_logger.py": None, "../../../../test/distributed/test_c10d_object_collectives.py": ( # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! 
@@ -275,6 +94,7 @@ "test_dynamo_trace_allgather_coalesced", "test_inductor_reduce_scatter_coalesced", "test_inductor_all_gather_coalesced", + "test_reorder_peak_memory", ), "../../../../test/distributed/test_multi_threaded_pg.py": None, "../../../../test/distributed/test_store.py": None, @@ -386,15 +206,9 @@ "../../../../test/distributed/tensor/test_common_rules.py": None, "../../../../test/distributed/tensor/test_dtensor.py": None, "../../../../test/distributed/tensor/test_dtensor_compile.py": None, - "../../../../test/distributed/tensor/test_experimental_ops.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1604 - "test_bernoulli", - ), + "../../../../test/distributed/tensor/test_experimental_ops.py": None, "../../../../test/distributed/tensor/test_init.py": None, "../../../../test/distributed/tensor/test_math_ops.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1604, hang - "test_mean", - "test_nll_loss_and_cross_entropy", # https://jira.devtools.intel.com/browse/MLSL-3625 "test_cumsum", "test_layer_norm_bwd", @@ -435,10 +249,7 @@ "test_unused_forward_module", "test_unused_forward_output", ), - "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1661 - "test_clip_grad_norm_2d", - ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( # ValueError: Cannot use ReduceOp.PREMUL_SUM with XCCL # https://github.com/intel/torch-xpu-ops/issues/1571, 2.8 skipped @@ -459,10 +270,7 @@ "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1605 - "test_fully_shard_training_memory", - ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": ( # https://jira.devtools.intel.com/browse/MLSL-3625 "test_compute_dtype", @@ -473,15 +281,9 @@ # Performance test, should skip "test_fully_shard_training_overlap", ), - "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": ( - # Expected zero exit code but got -9 - # https://github.com/intel/torch-xpu-ops/issues/1663 - "test_dp_state_dict_cpu_offload", - ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1661 - "test_train_parity_multi_group_unshard_async_op", # checkpointing issue, 2.8 skipped "test_train_parity_with_activation_checkpointing", # https://jira.devtools.intel.com/browse/MLSL-3625 From 7493676b7da640fa37e19b125fe4504bf85e87a4 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 06:01:16 -0700 Subject: [PATCH 51/83] enable fsdp cases based on local branch --- test/xpu/run_distributed_local.py | 63 +++++++++++++++++++++++++++++++ test/xpu/skip_list_dist_local.py | 57 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 
100644 test/xpu/run_distributed_local.py create mode 100644 test/xpu/skip_list_dist_local.py diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..8074b3292 --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,63 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict +from xpu_test_utils import launch_test + +res = 0 +fail_test = [] + +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log", "r") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(' ') + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split('/')[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ( "XL" in item or "S" in item ): + if len(affinity) == 0: + affinity = str(j-2) + else: + affinity = affinity + ',' + str(j-2) + gpu_dict[i] = affinity + + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..08f90c6b5 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,57 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + "test_delayed_optim_step_offload_true_no_shard_xpu", + "test_transformer_no_grad_mixed_precision_True_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + 
"../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + "test_fsdp_zero2_eval_with_prefetch", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1537 + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + "test_use_orig_params", + ), + # Performance check, skip + #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # "test_forward_overlap", + # "test_forward_overlap_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, +} From 7d5e6a90c6c61027267ec4ede3c79da753a66420 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 19:46:24 -0700 Subject: [PATCH 52/83] add 2025.0 WA --- test/xpu/run_distributed_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 8074b3292..b6a9ef60c 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,9 @@ res = 0 fail_test = [] +os.environ["CCL_ATL_TRANSPORT"] = "ofi" +os.environ["CCL_SEND"] = "direct" +os.environ["CCL_RECV"] = "direct" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: From 565d86adbad02c52c6168a2f6aa47d0af19fc87a Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:27:44 -0700 Subject: [PATCH 53/83] Update distributed UT cases in DDP and PP Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 29 +++++++++- test/xpu/skip_list_dist_local.py | 91 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index b6a9ef60c..982f05409 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -2,15 +2,17 @@ import subprocess import sys -from skip_list_dist_local import skip_dict +from skip_list_dist_local import skip_dict, skip_dict_python from xpu_test_utils import launch_test res = 0 +res2 = 0 fail_test = [] os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" os.environ["CCL_RECV"] = "direct" +os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: @@ -48,6 +50,29 @@ print("xpu-smi topology failed") 
sys.exit(255) +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + +for key in skip_dict_python: + skip_list = skip_dict_python[key] + test_command = ["python", key] + fail = run(test_command) + if fail.returncode: + for line in fail.stderr.split("\n"): + if "FAIL: " in line: + is_error = True + for skip_case in skip_list: + if skip_case in line: + print("Skiped error: ", key + " " + skip_case) + is_error = False + if is_error: + res2 += fail.returncode + fail_test.append("".join(key + " " + line)) + # run pytest with skiplist for key in skip_dict: skip_list = skip_dict[key] @@ -61,6 +86,6 @@ exit_code = os.WEXITSTATUS(res) if exit_code == 0: - sys.exit(res) + sys.exit(res2) else: sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 08f90c6b5..d65b7aee6 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -54,4 +54,95 @@ "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_reduce_scatter_tensor_coalesced", + "test_reduce_scatter_tensor_single", + # https://github.com/intel/torch-xpu-ops/issues/1525 + # ValueError: trying to initialize the default process group twice! + "test_inductor_all_gather_into_tensor_coalesced", + "test_inductor_all_gather_into_tensor_single", + "test_inductor_all_reduce_coalesced", + "test_inductor_all_reduce_non_contig_input", + "test_inductor_all_reduce_single", + "test_inductor_all_to_all_single", + "test_inductor_broadcast", + "test_inductor_inplace_op_on_view", + "test_inductor_reduce_scatter_tensor_coalesced", + "test_inductor_reduce_scatter_tensor_single", + "test_inductor_reuse_buffer_after_inplace_collective", + "test_ranks_and_tag", + "test_wait_tensor", + ), + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_gather_object_cpu", + "test_gather_object_xpu", + "test_gather_object_list_cpu", + "test_gather_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + "test_asymmetric_compilation", + "test_asymmetric_compilation_with_fx_cache", + # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
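For context, each skip_dict entry above maps a test file path (relative to the build tree) to either None, meaning run every case, or a tuple of case names to deselect. launch_test() lives in xpu_test_utils and is not shown in this patch; one plausible way such an entry can be turned into a pytest invocation is sketched below, purely as an illustration and not necessarily what launch_test actually does.

# Illustration only: translate a skip_dict entry into a pytest -k expression.
import shlex

def build_command(test_file, skip_cases):
    cmd = ["pytest", "-v", test_file]
    if skip_cases:
        # "not test_a and not test_b" deselects the known failures by name
        cmd += ["-k", " and ".join(f"not {case}" for case in skip_cases)]
    return cmd

entry = "../../../../test/distributed/fsdp/test_fsdp_misc.py"
skips = ("test_fsdp_zero2_eval_with_prefetch",)
print(shlex.join(build_command(entry, skips)))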
+ "test_compiled_flex_attention_full_model_ddp", + "test_compiled_flex_attention_local_ddp", + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # https://github.com/intel/torch-xpu-ops/issues/1527 + "test_compiler_collectives_automatic_dynamic_scalar", + "test_compiler_collectives_automatic_dynamic_speculation_divergence", + "test_compiler_collectives_automatic_dynamic_tensor", + "test_compiler_collectives_dim_mismatch", + "test_compiler_collectives_graph_break_empty_graph_still_collective", + "test_compiler_collectives_missing_source", + "test_compiler_collectives_scalar_missing_source", + "test_compiler_collectives_type_mismatch", + "test_ddp_activation_checkpointing", + "test_ddp_baseline_aot_eager_multiprocess", + "test_fsdp_activation_checkpointing", + "test_fsdp_aot_eager", + "test_fsdp_inductor", + "test_fsdp_setattr", + "test_fsdp_unspecialized_forced_getattr_inline", + "test_fsdp_unspecialized_forced_getattr_no_inline", + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_get_pg_attr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_tracing_xpu", + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + ), + "../../../../test/distributed/test_multi_threaded_pg.py": ( + # oneccl not support multi-threaded well, so skip it first. + "test_bwd_sees_fwd_pg", + ), + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, +} + +skip_dict_python = { + "distributed/test_c10d_ops_xccl.py": None, + "distributed/test_c10d_xccl.py": None, + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
+ "../../../../test/distributed/pipelining/test_stage.py": None, } From 9d0ddfe82f912947d0cc0e95a0f1d640c04f80c0 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:36:16 -0700 Subject: [PATCH 54/83] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 7 +++---- test/xpu/skip_list_dist_local.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 982f05409..a5f0c8098 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -11,7 +11,7 @@ os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" +os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") @@ -36,13 +36,12 @@ else: affinity = affinity + ',' + str(j-2) gpu_dict[i] = affinity - - + max_affinity = "" for key, value in gpu_dict.items(): if len(value) > len(max_affinity): max_affinity = value - + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index d65b7aee6..6ce62b8ca 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -96,7 +96,7 @@ # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. "test_compiled_flex_attention_full_model_ddp", "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ # https://github.com/intel/torch-xpu-ops/issues/1527 "test_compiler_collectives_automatic_dynamic_scalar", "test_compiler_collectives_automatic_dynamic_speculation_divergence", @@ -131,13 +131,13 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, } skip_dict_python = { From 9cf12d50b27aecd1275717613c9a25d33d105029 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 02:01:55 -0700 Subject: [PATCH 55/83] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 38 ++++++++++++++++--------------- test/xpu/skip_list_dist_local.py | 17 +++++++------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index a5f0c8098..d4db4785a 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -17,29 +17,29 @@ ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: gpu_dict = {} - with open("topology.log", "r") as file: + with open("topology.log") as file: lines = file.readlines() for line in lines: - if 
"CPU Affinity" in line: - continue - line = line.strip() - if line.startswith("GPU "): - items = line.split(' ') - items = [x for x in items if x] - gpu_id = items[1] - i = gpu_id.split('/')[0] - affinity = "" - for j, item in enumerate(items): - if "SYS" not in item and ( "XL" in item or "S" in item ): - if len(affinity) == 0: - affinity = str(j-2) - else: - affinity = affinity + ',' + str(j-2) - gpu_dict[i] = affinity + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity max_affinity = "" for key, value in gpu_dict.items(): - if len(value) > len(max_affinity): + if len(value) > len(max_affinity): max_affinity = value os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) @@ -49,6 +49,7 @@ print("xpu-smi topology failed") sys.exit(255) + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -56,6 +57,7 @@ def run(test_command): print(result.stderr) return result + for key in skip_dict_python: skip_list = skip_dict_python[key] test_command = ["python", key] diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 6ce62b8ca..0ac46961e 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -1,10 +1,10 @@ skip_dict = { "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, # https://github.com/intel/torch-xpu-ops/issues/1536 - #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( # "test_distributed_checkpoint_state_dict_type0_xpu", # "test_distributed_checkpoint_state_dict_type1_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, @@ -37,11 +37,11 @@ "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( "test_use_orig_params", ), - # Performance check, skip - #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( # "test_forward_overlap", # "test_forward_overlap_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, @@ -58,7 +58,7 @@ "../../../../test/distributed/test_c10d_common.py": None, "../../../../test/distributed/test_c10d_functional_native.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 - #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path "test_reduce_scatter_tensor_coalesced", "test_reduce_scatter_tensor_single", # https://github.com/intel/torch-xpu-ops/issues/1525 @@ -123,7 +123,7 @@ # RuntimeError: UR backend failed. 
UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) # https://github.com/intel/torch-xpu-ops/issues/1526 "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. @@ -131,7 +131,6 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, @@ -143,6 +142,6 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. "../../../../test/distributed/pipelining/test_stage.py": None, } From 8705ec54115e50d79d1abf719ba17a54290004ba Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:25:50 +0000 Subject: [PATCH 56/83] add distributed ut in CI --- .github/scripts/ut_result_check.sh | 17 ++-- .github/workflows/_linux_build.yml | 30 ++++++- .github/workflows/_linux_ut.yml | 140 +++++++++++++++++++++++++++++ .github/workflows/pull.yml | 25 ++++++ 4 files changed, 202 insertions(+), 10 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index a666bfa9a..260672ce0 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -134,19 +134,20 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - grep -E "have failures" xpu_distributed_test.log | awk '{print $1}' >> ./"${ut_suite}"_xpu_distributed_test_failed.log - compare_and_filter_logs "${ut_suite}"_xpu_distributed_test_failed.log Known_issue.log - if [[ -f "${ut_suite}_xpu_distributed_test_failed_filtered.log" ]]; then - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed_filtered.log") + +if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + grep -E "have failures" xpu_distributed_test.log | awk '{print $1}' >> ./"${ut_suite}"_test_failed.log + compare_and_filter_logs "${ut_suite}"_test_failed.log Known_issue.log + if [[ -f "${ut_suite}_test_failed_filtered.log" ]]; then + num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_test_failed_filtered.log") else - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") + num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_test_failed.log") fi echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" echo -e "=========================================================================" - cat 
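compare_and_filter_logs is defined earlier in ut_result_check.sh and is not part of this diff; as used here, its effect is to drop failures already recorded in Known_issue.log before the remaining lines are counted. Roughly, and only as a sketch of that behavior with illustrative names:

# Rough Python equivalent of the known-issue filtering step in ut_result_check.sh.
def filter_known_issues(failed_lines, known_issue_lines):
    known = {line.strip() for line in known_issue_lines if line.strip()}
    return [line for line in failed_lines if line.strip() not in known]

failed = ["test_gather_object_cpu", "test_new_feature_regression"]
known = ["test_gather_object_cpu"]           # hypothetical known issue
remaining = filter_known_issues(failed, known)
print(len(remaining), remaining)             # 1 real failure left to report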
"./${ut_suite}_xpu_distributed_test_failed.log" + cat "./${ut_suite}_test_failed.log" ((num_failed=num_failed_xpu_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index f7381a502..69a843e35 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -74,8 +74,34 @@ jobs: pip install -U pip wheel setuptools - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - with: - path: torch-xpu-ops + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_build || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_build + conda create -n xpu_build python=${{ inputs.python }} cmake=3.28 ninja -y + source activate xpu_build + cd ../ && rm -rf pytorch + pip install requests + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi +>>>>>>> 62e9ff75 (add distributed ut in CI) - name: Build Pytorch XPU run: | set -xe diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 7f29d89d3..2fab9fdb1 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -461,3 +461,143 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 02cd96b5b..24f9e2b1e 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -66,6 +66,31 @@ jobs: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e Inductor-XPU-E2E-CI-Tests: name: preci-linux / e2e_test From 4b94ee22404a692a3bd8016d7ede32dbf186cbf8 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:52:17 +0000 Subject: [PATCH 57/83] update if condition --- .github/workflows/_linux_build.yml | 26 ++++++++++++++------------ .github/workflows/_linux_ut.yml | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 69a843e35..d8c7eb2c5 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -86,20 +86,22 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - fi - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi >>>>>>> 62e9ff75 (add distributed ut in CI) - name: Build Pytorch XPU diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 2fab9fdb1..12cf823f5 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: GH_TOKEN: ${{ github.token }} From e52ae48b505a553c7f44fa8814a7e7fe31a4060d Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 23:02:20 +0000 Subject: [PATCH 58/83] keep_torch_xpu_ops --- .github/workflows/pull.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 24f9e2b1e..ee0caaac8 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,6 +78,7 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 + keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From 0fc4430b7b512309b7c08109476789cba906a679 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 13:37:10 +0000 Subject: [PATCH 59/83] update pytorch build --- .github/workflows/_linux_build.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index d8c7eb2c5..69a843e35 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -86,22 +86,20 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi >>>>>>> 62e9ff75 (add distributed ut in CI) - name: Build Pytorch XPU From 45dfc65ece6d109e6b6ac0299975d0931c1afc4b Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Wed, 9 Apr 2025 00:18:27 -0700 Subject: [PATCH 60/83] Enabled UT for test/distributed/tensor Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0ac46961e..42cdebf19 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -136,7 +136,85 @@ "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( + # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' + # is not currently implemented for the XPU device + # https://github.com/intel/torch-xpu-ops/issues/1547 + "test_dtensor_seq_par_shard_dim_0", + "test_dtensor_seq_par_shard_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' + # https://github.com/intel/torch-xpu-ops/issues/1548 + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", + # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' + # https://github.com/intel/torch-xpu-ops/issues/1549 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", + # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1550 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", + # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' + # is not currently implemented for the XPU device. 
+ # https://github.com/intel/torch-xpu-ops/issues/1551 + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_2", + ), + "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( + # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! + # https://github.com/intel/torch-xpu-ops/issues/1555 + "test/distributed/tensor/parallel/test_tp_examples.py::DistTensorParallelExampleTest::test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_output__tok_embeddings", + "test_transformer_training_is_seq_parallel_False_float32", + "test_transformer_training_is_seq_parallel_True_float32", + # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. + # https://github.com/intel/torch-xpu-ops/issues/1556 + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + ), "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, + "../../../../test/distributed/tensor/test_api.py": None, + "../../../../test/distributed/tensor/test_attention.py": None, + "../../../../test/distributed/tensor/test_common_rules.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": None, + "../../../../test/distributed/tensor/test_dtensor_compile.py": None, + "../../../../test/distributed/tensor/test_experimental_ops.py": None, + "../../../../test/distributed/tensor/test_init.py": None, + "../../../../test/distributed/tensor/test_math_ops.py": ( + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_mean", + "test_nll_loss_and_cross_entropy", + ), + "../../../../test/distributed/tensor/test_random_ops.py": None, + "../../../../test/distributed/tensor/test_redistribute.py": None, + "../../../../test/distributed/tensor/test_tensor_ops.py": None, + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, } skip_dict_python = { @@ -144,4 +222,5 @@ "distributed/test_c10d_xccl.py": None, "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, } From ebbce64a2d99ed92e2142cbe18c6a8643a0e56a2 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 06:01:16 -0700 Subject: [PATCH 61/83] enable fsdp cases based on local branch --- test/xpu/run_distributed_local.py | 75 ++++++-- test/xpu/skip_list_dist_local.py | 301 +++++++++++++++++++++--------- 2 files changed, 269 insertions(+), 107 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index d4db4785a..dcf079992 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -3,15 +3,12 @@ import sys from skip_list_dist_local import skip_dict, skip_dict_python -from xpu_test_utils import launch_test res = 0 res2 = 0 fail_test = [] +error_log = "" -os.environ["CCL_ATL_TRANSPORT"] = "ofi" -os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") @@ -50,6 +47,9 @@ sys.exit(255) +from xpu_test_utils import launch_test + + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -59,20 +59,63 @@ def run(test_command): for key in skip_dict_python: - skip_list = skip_dict_python[key] + skip_list = skip_dict_python[key] if skip_dict_python[key] else [] test_command = ["python", key] fail = run(test_command) + num_skipped = 0 + num_err = 0 if fail.returncode: - for line in fail.stderr.split("\n"): - if "FAIL: " in line: - is_error = True - for skip_case in skip_list: - if skip_case in line: - print("Skiped error: ", key + " " + skip_case) - is_error = False - if is_error: - res2 += fail.returncode - fail_test.append("".join(key + " " + line)) + for i, err in enumerate(fail.stderr.split("FAIL: ")): + if i == 0 and len(err) > 0: + error_log += err + continue + is_skipped = False + for skip_case in skip_list: + if skip_case in err: + print("Skipped error: ", key + " " + skip_case) + num_skipped += 1 + is_skipped = True + break + if not is_skipped: + num_err += 1 + res2 += fail.returncode + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ( + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n" + ) + else: + error_log += line + "\n" + else: + error_log += "FAIL: " + err + else: + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ( + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n" + ) + + renamed_key = key.replace("../../../../", "").replace("/", "_") + if num_err > 0: + fail_test.append(key) + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(error_log) + else: + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(fail.stdout) + f.write(fail.stderr) # run pytest with skiplist for key in skip_dict: @@ -89,4 +132,4 @@ def run(test_command): if exit_code == 0: sys.exit(res2) else: - sys.exit(exit_code) + sys.exit(exit_code) \ No newline at end of file diff --git 
a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 42cdebf19..58c73eed2 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -7,14 +7,20 @@ # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, - "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( + # Accuracy gap in FSDP checkpoint related UT + # https://github.com/intel/torch-xpu-ops/issues/1666, 2.8 skipped + "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", + "test_checkpoint_submodule_use_reentrant_False_xpu", + ), "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, - "../../../../test/distributed/fsdp/test_fsdp_core.py": ( - "test_delayed_optim_step_offload_true_no_shard_xpu", - "test_transformer_no_grad_mixed_precision_True_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_core.py": None, "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, @@ -28,17 +34,18 @@ "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( - "test_fsdp_zero2_eval_with_prefetch", + # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504, Performance test, should skip + "test_fsdp_optimizer_overlap", ), "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, - # https://github.com/intel/torch-xpu-ops/issues/1537 - "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( - "test_use_orig_params", - ), + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": None, # Performance check, skip # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504 # "test_forward_overlap", # "test_forward_overlap_xpu", # ), @@ -56,82 +63,55 @@ "../../../../test/distributed/fsdp/test_wrap.py": None, "../../../../test/distributed/test_backends.py": None, "../../../../test/distributed/test_c10d_common.py": None, - "../../../../test/distributed/test_c10d_functional_native.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1508 - # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path - "test_reduce_scatter_tensor_coalesced", - "test_reduce_scatter_tensor_single", - # https://github.com/intel/torch-xpu-ops/issues/1525 - # ValueError: trying to initialize the default process group twice! 
- "test_inductor_all_gather_into_tensor_coalesced", - "test_inductor_all_gather_into_tensor_single", - "test_inductor_all_reduce_coalesced", - "test_inductor_all_reduce_non_contig_input", - "test_inductor_all_reduce_single", - "test_inductor_all_to_all_single", - "test_inductor_broadcast", - "test_inductor_inplace_op_on_view", - "test_inductor_reduce_scatter_tensor_coalesced", - "test_inductor_reduce_scatter_tensor_single", - "test_inductor_reuse_buffer_after_inplace_collective", - "test_ranks_and_tag", - "test_wait_tensor", - ), + "../../../../test/distributed/test_c10d_functional_native.py": None, "../../../../test/distributed/test_c10d_logger.py": None, "../../../../test/distributed/test_c10d_object_collectives.py": ( - # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds - # https://github.com/intel/torch-xpu-ops/issues/1535 - "test_gather_object_cpu", - "test_gather_object_xpu", - "test_gather_object_list_cpu", - "test_gather_object_list_xpu", + # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_scatter_object_list_cpu", + "test_scatter_object_list_xpu", ), "../../../../test/distributed/test_compute_comm_reordering.py": None, "../../../../test/distributed/test_control_collectives.py": None, "../../../../test/distributed/test_device_mesh.py": None, "../../../../test/distributed/test_dynamo_distributed.py": ( - # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' - "test_asymmetric_compilation", - "test_asymmetric_compilation_with_fx_cache", - # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. - "test_compiled_flex_attention_full_model_ddp", - "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ - # https://github.com/intel/torch-xpu-ops/issues/1527 - "test_compiler_collectives_automatic_dynamic_scalar", - "test_compiler_collectives_automatic_dynamic_speculation_divergence", - "test_compiler_collectives_automatic_dynamic_tensor", - "test_compiler_collectives_dim_mismatch", - "test_compiler_collectives_graph_break_empty_graph_still_collective", - "test_compiler_collectives_missing_source", - "test_compiler_collectives_scalar_missing_source", - "test_compiler_collectives_type_mismatch", - "test_ddp_activation_checkpointing", - "test_ddp_baseline_aot_eager_multiprocess", - "test_fsdp_activation_checkpointing", - "test_fsdp_aot_eager", - "test_fsdp_inductor", + # AssertionError: 'setattr() on Tensor.requires_grad' not found in 'Attempted to call function marked as skipped + # https://github.com/intel/torch-xpu-ops/issues/1667, 2.8 skipped "test_fsdp_setattr", - "test_fsdp_unspecialized_forced_getattr_inline", - "test_fsdp_unspecialized_forced_getattr_no_inline", - # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) - # https://github.com/intel/torch-xpu-ops/issues/1526 - "test_get_pg_attr", ), "../../../../test/distributed/test_fake_pg.py": None, - "../../../../test/distributed/test_functional_api.py": ( - # RuntimeError: UR backend failed. 
UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) - # https://github.com/intel/torch-xpu-ops/issues/1526 - "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", - ), - "../../../../test/distributed/test_multi_threaded_pg.py": ( - # oneccl not support multi-threaded well, so skip it first. - "test_bwd_sees_fwd_pg", + "../../../../test/distributed/test_functional_api.py": None, + "../../../../test/distributed/test_inductor_collectives.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1581, 2.8 skipped + # Fatal Python error: Segmentation fault + "test_dynamo_rewrite_dist_all_gather", + "test_dynamo_rewrite_dist_all_gather_list", + "test_dynamo_rewrite_dist_all_gather_args_match", + "test_dynamo_rewrite_dist_reduce_scatter", + "test_dynamo_support_collective_op_with_async_op_False", + "test_dynamo_trace_reduce_scatter_tensor", + "test_dynamo_trace_all_gather_tensor", + "test_dynamo_trace_allgather_coalesced", + "test_inductor_reduce_scatter_coalesced", + "test_inductor_all_gather_coalesced", + "test_reorder_peak_memory", ), + "../../../../test/distributed/test_multi_threaded_pg.py": None, "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, + # ( + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504 + # "test_stage_backward_weight_multiple_iters_xpu", + # "test_stage_backward_weight_xpu", + # "test_stage_backward_xpu", + # ), "../../../../test/distributed/pipelining/test_microbatch.py": None, + # ( + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504, need retest with oneccl fix + # "test_chunk_spec_xpu", + # ), "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, @@ -139,7 +119,7 @@ "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' # is not currently implemented for the XPU device - # https://github.com/intel/torch-xpu-ops/issues/1547 + # https://github.com/intel/torch-xpu-ops/issues/1547, 2.8 skipped "test_dtensor_seq_par_shard_dim_0", "test_dtensor_seq_par_shard_dim_1", "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", @@ -148,7 +128,7 @@ "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' - # https://github.com/intel/torch-xpu-ops/issues/1548 + # https://github.com/intel/torch-xpu-ops/issues/1548, 2.8 skipped "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", @@ -156,7 +136,7 @@ "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' 
- # https://github.com/intel/torch-xpu-ops/issues/1549 + # https://github.com/intel/torch-xpu-ops/issues/1549, 2.8 skipped "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", @@ -164,14 +144,14 @@ "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. - # https://github.com/intel/torch-xpu-ops/issues/1550 + # https://github.com/intel/torch-xpu-ops/issues/1550, 2.8 skipped "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' # is not currently implemented for the XPU device. - # https://github.com/intel/torch-xpu-ops/issues/1551 + # https://github.com/intel/torch-xpu-ops/issues/1551, 2.8 skipped "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", @@ -183,8 +163,8 @@ ), "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! - # https://github.com/intel/torch-xpu-ops/issues/1555 - "test/distributed/tensor/parallel/test_tp_examples.py::DistTensorParallelExampleTest::test_transformer_req_grad_seq_parallel_float32_thaw_all", + # https://github.com/intel/torch-xpu-ops/issues/1555, 2.8 skipped + "test_transformer_req_grad_seq_parallel_float32_thaw_all", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", @@ -192,13 +172,36 @@ "test_transformer_training_is_seq_parallel_False_float32", "test_transformer_training_is_seq_parallel_True_float32", # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. 
- # https://github.com/intel/torch-xpu-ops/issues/1556 + # https://github.com/intel/torch-xpu-ops/issues/1556, 2.8 skipped "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_loss_parallel", + "test_mlp_training_is_seq_parallel_False_recompute_activation_False", + "test_mlp_training_is_seq_parallel_True_recompute_activation_False", + "test_transformer_req_grad_float64_thaw_all", + "test_transformer_training_is_seq_parallel_False_float64", + "test_transformer_training_is_seq_parallel_True_float64", + "test_sequence_parallel_style", ), "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, - "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_linear_col_wise_parallel", + "test_parallelize_mlp_with_module_api", + "test_parallelize_mlp_with_module_api_nested", + "test_parallelize_module_multi_wildcard", + "test_parallelize_module_src_data_rank", + "test_parallelize_module_with_digit", + "test_parallelize_module_with_question", + "test_parallelize_module_with_star", + "test_under_devicemesh_context", + ), "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, - "../../../../test/distributed/tensor/test_api.py": None, + "../../../../test/distributed/tensor/test_api.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_distribute_tensor_rank", + "test_distribute_tensor_uneven_sharding", + ), "../../../../test/distributed/tensor/test_attention.py": None, "../../../../test/distributed/tensor/test_common_rules.py": None, "../../../../test/distributed/tensor/test_dtensor.py": None, @@ -206,21 +209,137 @@ "../../../../test/distributed/tensor/test_experimental_ops.py": None, "../../../../test/distributed/tensor/test_init.py": None, "../../../../test/distributed/tensor/test_math_ops.py": ( - # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path - # https://github.com/intel/torch-xpu-ops/issues/1508 - "test_mean", - "test_nll_loss_and_cross_entropy", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_cumsum", + "test_layer_norm_bwd", + "test_layer_norm_bwd_req_grad", + "test_layer_norm_fwd", + "test_linear_op_reductions", + "test_shard0_svd", + "test_softmax_fwd", + "test_topk", ), "../../../../test/distributed/tensor/test_random_ops.py": None, - "../../../../test/distributed/tensor/test_redistribute.py": None, - "../../../../test/distributed/tensor/test_tensor_ops.py": None, - "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, + "../../../../test/distributed/tensor/test_redistribute.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_redistribute_shard_dim_change", + "test_redistribute_uneven_sharding", + "test_shard_to_replicate_forward_backward", + "test_shard_to_replicate_forward_backward_datatype_conversion", + "test_multi_dim_mesh", + ), + "../../../../test/distributed/tensor/test_tensor_ops.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_aten_contiguous", + "test_gather", + "test_index", + "test_slice", + "test_stack", + "test_where_type_promotion", + ), + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_argmax", + "test_softmax_fwd", + ), + # 
FSDP2 + "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_nontensor_activations", + "test_unused_forward_module", + "test_unused_forward_output", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( + # ValueError: Cannot use ReduceOp.PREMUL_SUM with XCCL + # https://github.com/intel/torch-xpu-ops/issues/1571, 2.8 skipped + "test_set_reduce_scatter_divide_factor", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": ( + # torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised + # https://github.com/intel/torch-xpu-ops/issues/1665, 2.8 skipped + "test_transformer_backend_inductor_fullgraph_True", + "test_nested_fully_shard_backend_inductor_fullgraph_True", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_train_mixed_requires_grad_per_group", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_compute_dtype", + "test_grad_acc_with_reduce_dtype", + "test_reduce_dtype", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": ( + # Performance test, should skip + "test_fully_shard_training_overlap", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( + # checkpointing issue, 2.8 skipped + "test_train_parity_with_activation_checkpointing", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_1f1b_microbatching", + "test_gradient_accumulation", + ), + "../../../../test/distributed/_composable/test_replicate_with_compiler.py": ( + # AssertionError: Tensor-likes are not close! 
+ # https://github.com/intel/torch-xpu-ops/issues/1668, 2.8 skipped + "test_compile_backward_only", + "test_compile_bf16", + "test_compile_fp16", + "test_compile_gpu", + "test_compile_gpu_ac", + ), + "../../../../test/distributed/_shard/test_sharder.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_custom_sharder", + ), + "../../../../test/distributed/_shard/sharded_tensor/test_logger.py": None, + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor.py": { + # RuntimeError: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) + # https://github.com/intel/torch-xpu-ops/issues/1617, 2.8 skipped + "test_complete_world_size", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_grid_sharding", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_with_rpc_names", + "test_init_from_local_tensor", + # what(): Attempting to send a Tensor with unexpected device type xpu:3 + # https://github.com/intel/torch-xpu-ops/issues/1616, 2.8 skipped + "test_init_from_local_shards", + "test_init_from_local_shards_and_global_metadata", + }, + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py": None, + "../../../../test/distributed/_shard/sharding_plan/test_sharding_plan.py": None, + "../../../../test/distributed/_shard/sharding_spec/test_sharding_spec.py": None, + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None, + # ( + # # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # # https://github.com/intel/torch-xpu-ops/issues/1508, 2.8 skipped + # "test_tracker_with_activation_checkpointing", + # ), + "../../../../test/distributed/_tools/test_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_memory_tracker.py": None, + "../../../../test/distributed/_tools/test_mod_tracker.py": None, } skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + # "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, -} +} \ No newline at end of file From 6e54fb84daa243244f3f3554d3eb4f1615d0a3c5 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:25:50 +0000 Subject: [PATCH 62/83] add distributed ut in CI --- .github/scripts/ut_result_check.sh | 7 +- .github/workflows/_linux_ut.yml | 140 +++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 4 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 260672ce0..cea234d31 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -134,10 +134,9 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi - if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then - grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log - grep -E "have failures" xpu_distributed_test.log | awk '{print $1}' >> ./"${ut_suite}"_test_failed.log + grep -E "^FAILED" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + grep -E "have failures" "${ut_suite}"_test.log | awk '{print $1}' >> ./"${ut_suite}"_test_failed.log compare_and_filter_logs "${ut_suite}"_test_failed.log Known_issue.log if [[ -f "${ut_suite}_test_failed_filtered.log" ]]; then num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_test_failed_filtered.log") @@ -155,4 +154,4 @@ if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distribute else echo -e "[PASS] UT ${ut_suite} test Pass" fi -fi +fi \ No newline at end of file diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 12cf823f5..8f13b267a 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -601,3 +601,143 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log From 5659efd4c2654a7abf24f86e6c9795c44a160d6b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:52:17 +0000 Subject: [PATCH 63/83] update if condition --- .github/workflows/_linux_build.yml | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 69a843e35..d8c7eb2c5 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -86,20 +86,22 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - fi - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi >>>>>>> 62e9ff75 (add distributed ut in CI) - name: Build Pytorch XPU From 596f231d0117a60c4600002935b4cd57bfb15db1 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 13:37:10 +0000 Subject: [PATCH 64/83] update pytorch build --- .github/workflows/_linux_build.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index d8c7eb2c5..69a843e35 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -86,22 +86,20 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" 
- else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi >>>>>>> 62e9ff75 (add distributed ut in CI) - name: Build Pytorch XPU From 2b958e1d944f0b1d4c75d23eb767a284304448fc Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 14:55:26 +0000 Subject: [PATCH 65/83] update if condition --- .github/workflows/_linux_ut.yml | 2 +- .github/workflows/pull.yml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 8f13b267a..592ba338f 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' && inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: GH_TOKEN: ${{ github.token }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ee0caaac8..24f9e2b1e 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,7 +78,6 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 - keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From 5d9a340656f4ecbbb09ac4c92c44a0a078159d89 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 18:12:34 +0000 Subject: [PATCH 66/83] resolve Artifact name conflict --- .github/workflows/_linux_build.yml | 4 ++-- .github/workflows/_linux_ut.yml | 15 +++++---------- .github/workflows/pull.yml | 4 ++-- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 69a843e35..630debe92 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -189,13 +189,13 @@ jobs: if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/torch*.whl - name: Upload Build Log if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/pytorch_*.log - name: Cleanup if: always() diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 592ba338f..74e1b92a4 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -97,7 +97,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -381,7 +381,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -515,18 +515,13 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | source activate xpu_op_${ZE_AFFINITY_MASK} source .github/scripts/env.sh ${{ inputs.pytorch }} pip install mkl-static==2025.0.1 mkl-include==2025.0.1 - if [[ ${{ inputs.abi }} == '0' ]]; then - export _GLIBCXX_USE_CXX11_ABI=0 - else - export _GLIBCXX_USE_CXX11_ABI=1 - fi if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then cd ../pytorch export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} @@ -575,7 +570,7 @@ jobs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi - timeout 10000 python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: UT Test Results Check @@ -599,7 +594,7 @@ jobs: if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed path: ${{ github.workspace }}/ut_log pytorch_distributed_test: diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 24f9e2b1e..472a48927 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -88,7 +88,7 @@ jobs: needs: preci-linux-build-distributed uses: ./.github/workflows/_linux_ut.yml with: - pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + pytorch: ${{ needs.preci-linux-build-distributed.outputs.torch_commit_id }} ut: pytorch_distributed runner: pvc_e2e @@ -137,7 +137,7 @@ jobs: - name: Download Pytorch wheel uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ needs.preci-linux-build.outputs.torch_commit_id }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | From 3fd92a3344ad65bfb0901e61ff6027916d4ca31d Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 9 Apr 2025 23:57:58 -0700 Subject: [PATCH 67/83] add FSDP2 cases, improved check-ut.py for summary, do ZE_AFFINITY_MASK configuration before import torch --- .github/scripts/check-ut.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 5758c4e6d..a66b4b095 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -256,4 +256,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file From 137272cbdc9fbc251b3eb0444f55562abc3d18d0 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Fri, 18 Apr 2025 02:17:54 -0700 Subject: [PATCH 68/83] Skipped error cases Signed-off-by: Cheng, Penghui --- test/xpu/xpu_test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 7b2bac5e6..a191f349b 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1180,4 +1180,4 @@ def launch_test(test_case, skip_list=None, exe_list=None): f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + test_case ) - return os.system(test_command) + return os.system(test_command) \ No newline at end of file From 5cd5b16f24f2de3c555bd0bb90806db07f800abc Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:03:10 +0000 Subject: [PATCH 69/83] update ut --- .github/workflows/_linux_ut.yml | 140 -------------------------------- 1 file changed, 140 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 74e1b92a4..cd4b52c60 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -596,143 +596,3 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed path: ${{ github.workspace }}/ut_log - - pytorch_distributed_test: - runs-on: ${{ inputs.runner }} - if: contains(inputs.ut, 'pytorch_distributed') - timeout-minutes: 900 - env: - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Stock Pytorch - run: | - pwd - which 
conda && conda clean -ay - conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch - pip install requests - git clone https://github.com/daisyden/pytorch.git pytorch - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi - fi - - name: Triton Installation - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../pytorch - TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" - if [ -z ${{ inputs.triton }} ]; then - TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" - else - TRITON_COMMIT_ID="${{ inputs.triton }}" - fi - echo ${TRITON_REPO}@${TRITON_COMMIT_ID} - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" - fi - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 - with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} - path: ${{ github.workspace }} - - name: Install Pytorch XPU - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - source .github/scripts/env.sh ${{ inputs.pytorch }} - pip install mkl-static==2025.0.1 mkl-include==2025.0.1 - if [[ ${{ inputs.abi }} == '0' ]]; then - export _GLIBCXX_USE_CXX11_ABI=0 - else - export _GLIBCXX_USE_CXX11_ABI=1 - fi - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd ../pytorch - export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - pip install -r requirements.txt - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. - else - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ../pytorch - git reset --hard && git checkout ${TORCH_COMMIT_ID} - TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cd ${{ github.workspace }} - sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - - name: UT Test Results Check - shell: bash - run: | - function contains() { - contains_status="echo 'Start $2 ...'" - { - [[ $1 =~ (^|,)$2($|,) ]] - } || { - echo "[Warning] $2 is not suppotted type! Skipped!" 
- contains_status="continue" - } - } - set -xe - echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - cd ${{ github.workspace }}/ut_log/pytorch_distributed - cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ - bash ut_result_check.sh 'pytorch_distributed' - - name: Upload Inductor XPU UT Log - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed - path: ${{ github.workspace }}/ut_log From 0789c991c74a169c22fcc73ae92b4d612c100e1a Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Wed, 16 Apr 2025 02:30:33 -0700 Subject: [PATCH 70/83] add distributed ut summary --- .github/workflows/_linux_ut.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index cd4b52c60..3c90b13de 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -572,7 +572,14 @@ jobs: fi python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cd ${{ github.workspace }} + mkdir -p ut_log/pytorch_distributed_summary + cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: Distributed UT Test Results Summary + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + pip install junitparser + python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/pytorch_distributed_summary/* >> $GITHUB_STEP_SUMMARY || true - name: UT Test Results Check shell: bash run: | From 4f6bd8d87180124ea08beb145f5a33819854286f Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Wed, 16 Apr 2025 02:32:39 -0700 Subject: [PATCH 71/83] align the path --- .github/workflows/_linux_ut.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 3c90b13de..e0b075bd5 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -564,6 +564,7 @@ jobs: sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk sudo echo "0"|sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/pytorch_distributed + mkdir -p ut_log/pytorch_distributed_summary cd ../pytorch/third_party/torch-xpu-ops/test/xpu XCCL_EANBLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE}}" == 'False' ]]; then @@ -571,9 +572,8 @@ jobs: exit 1 fi python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cd ${{ github.workspace }} - mkdir -p ut_log/pytorch_distributed_summary cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary + cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: Distributed UT Test Results Summary run: | From bc65a513587ab35d289fd41c57d9fe3835a2b93e Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:50:43 +0000 Subject: [PATCH 72/83] update --- .github/workflows/_linux_ut.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index e0b075bd5..0e4b6521c 100644 
--- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -346,7 +346,7 @@ jobs: rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests git clone https://github.com/pytorch/pytorch pytorch if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then @@ -564,7 +564,6 @@ jobs: sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk sudo echo "0"|sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/pytorch_distributed - mkdir -p ut_log/pytorch_distributed_summary cd ../pytorch/third_party/torch-xpu-ops/test/xpu XCCL_EANBLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE}}" == 'False' ]]; then @@ -572,8 +571,9 @@ jobs: exit 1 fi python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary cd ${{ github.workspace }} + mkdir -p ut_log/pytorch_distributed_summary + cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: Distributed UT Test Results Summary run: | From a86dc57592c9d5c55a494de2154d834bea9c29f9 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 18:01:42 +0000 Subject: [PATCH 73/83] update --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 0e4b6521c..d25f71b04 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -62,7 +62,7 @@ jobs: rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests git clone https://github.com/pytorch/pytorch pytorch if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then From 0cdcce88a68a59850090ed876de50129f71cb04b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 18:23:09 +0000 Subject: [PATCH 74/83] update --- .github/workflows/_linux_build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 630debe92..e942d990a 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -82,7 +82,7 @@ jobs: rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_build conda create -n xpu_build python=${{ inputs.python }} cmake=3.28 ninja -y source activate xpu_build - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch From e894f691964da3372fe2345db1fdb37ea2ad1a03 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Wed, 16 Apr 2025 18:18:08 -0700 Subject: [PATCH 75/83] align the path --- .github/workflows/_linux_ut.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index d25f71b04..ab1f741fe 
100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -564,16 +564,16 @@ jobs: sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk sudo echo "0"|sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/pytorch_distributed + mkdir -p ut_log/pytorch_distributed_summary cd ../pytorch/third_party/torch-xpu-ops/test/xpu XCCL_EANBLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE}}" == 'False' ]]; then echo -e "[ERROR] XCCL is not enabled" exit 1 fi - python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cd ${{ github.workspace }} - mkdir -p ut_log/pytorch_distributed_summary + python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary + cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: Distributed UT Test Results Summary run: | From a286091fdf34f8532e18f14a051961ddf595a652 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 16 May 2025 18:17:37 +0000 Subject: [PATCH 76/83] fix yml --- .github/workflows/_linux_build.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index e942d990a..1f520aa6d 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -101,7 +101,6 @@ jobs: # Workaround for torch-xpu-ops ci test sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi ->>>>>>> 62e9ff75 (add distributed ut in CI) - name: Build Pytorch XPU run: | set -xe From 97504b41087ecf2381db9a3303469510f4a43e92 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Tue, 20 May 2025 19:40:07 -0700 Subject: [PATCH 77/83] remove invalid case Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 05942858c..13dda8eba 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -181,7 +181,6 @@ "test_transformer_req_grad_float64_thaw_all", "test_transformer_training_is_seq_parallel_False_float64", "test_transformer_training_is_seq_parallel_True_float64", - "test_sequence_parallel_style", ), "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": ( From 5ca55a9f82d2e891adb21f90823f9e1af426f090 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 22 May 2025 02:39:33 -0700 Subject: [PATCH 78/83] Use python instead of pytest to run test_c10d_functional_native.py Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 13dda8eba..0e773ea1f 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -63,7 +63,6 @@ "../../../../test/distributed/fsdp/test_wrap.py": None, "../../../../test/distributed/test_backends.py": None, "../../../../test/distributed/test_c10d_common.py": None, - 
"../../../../test/distributed/test_c10d_functional_native.py": None, "../../../../test/distributed/test_c10d_logger.py": None, "../../../../test/distributed/test_c10d_object_collectives.py": ( # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! @@ -338,6 +337,7 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": None, # "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. "../../../../test/distributed/pipelining/test_stage.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, From 638767040e4e3263ec41b7334891c31ccd763990 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Thu, 22 May 2025 20:29:01 -0700 Subject: [PATCH 79/83] enable libfrabric WA --- test/xpu/run_distributed_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 94fbfae32..3008a8fc1 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -9,6 +9,9 @@ fail_test = [] error_log = "" +# libfabric WA to hang issue +os.environ["FI_PROVIDER"] = "tcp" + os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") From 219de35bf50f27702fdca07e69b8e3ef28a6fc74 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Tue, 27 May 2025 00:02:04 -0700 Subject: [PATCH 80/83] Add accuracy issue to skip list Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0e773ea1f..c2d305c7e 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -72,7 +72,11 @@ ), "../../../../test/distributed/test_compute_comm_reordering.py": None, "../../../../test/distributed/test_control_collectives.py": None, - "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_device_mesh.py": { + # RuntimeError: Process 1 exited with error code 10 and exception: + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_scatter_1d", + }, "../../../../test/distributed/test_dynamo_distributed.py": ( # AssertionError: 'setattr() on Tensor.requires_grad' not found in 'Attempted to call function marked as skipped # https://github.com/intel/torch-xpu-ops/issues/1667, 2.8 skipped From 92b3cad5e4103c18ae2a70c52845e55c3ed7ebcf Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Sun, 1 Jun 2025 02:22:42 +0800 Subject: [PATCH 81/83] fix skip_list_dist_local.py typo --- test/xpu/skip_list_dist_local.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index c2d305c7e..b06449f00 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -72,11 +72,11 @@ ), "../../../../test/distributed/test_compute_comm_reordering.py": None, "../../../../test/distributed/test_control_collectives.py": None, - "../../../../test/distributed/test_device_mesh.py": { + "../../../../test/distributed/test_device_mesh.py": ( # RuntimeError: Process 1 exited with error code 10 and exception: # https://jira.devtools.intel.com/browse/MLSL-3625 
"test_scatter_1d", - }, + ), "../../../../test/distributed/test_dynamo_distributed.py": ( # AssertionError: 'setattr() on Tensor.requires_grad' not found in 'Attempted to call function marked as skipped # https://github.com/intel/torch-xpu-ops/issues/1667, 2.8 skipped @@ -306,7 +306,7 @@ "test_custom_sharder", ), "../../../../test/distributed/_shard/sharded_tensor/test_logger.py": None, - "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor.py": { + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor.py": ( # RuntimeError: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) # https://github.com/intel/torch-xpu-ops/issues/1617, 2.8 skipped "test_complete_world_size", @@ -323,7 +323,7 @@ # https://github.com/intel/torch-xpu-ops/issues/1616, 2.8 skipped "test_init_from_local_shards", "test_init_from_local_shards_and_global_metadata", - }, + ), "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py": None, "../../../../test/distributed/_shard/sharding_plan/test_sharding_plan.py": None, "../../../../test/distributed/_shard/sharding_spec/test_sharding_spec.py": None, From fa10de24bda0b0b9900317d166f889888780dc29 Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Wed, 11 Jun 2025 17:53:26 +0800 Subject: [PATCH 82/83] improve ut_result_check.sh distributed part --- .github/scripts/ut_result_check.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 51fb13a12..07dad4c7b 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -134,8 +134,9 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then fi fi if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then - grep -E "^FAILED" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log - grep -E "have failures" "${ut_suite}"_test.log | awk '{print $1}' >> ./"${ut_suite}"_test_failed.log + grep -E "^FAILED" "${ut_suite}"_test.log | awk '{print $3}' > ./"${ut_suite}"_test_failed.log + # grep -E "have failures" "${ut_suite}"_test.log | awk '{print $1}' >> ./"${ut_suite}"_test_failed.log + sed -i '/^[^.]\+/d' ./"${ut_suite}"_test_failed.log compare_and_filter_logs "${ut_suite}"_test_failed.log Known_issue.log if [[ -f "${ut_suite}_test_failed_filtered.log" ]]; then num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_test_failed_filtered.log") @@ -153,4 +154,4 @@ if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distribute else echo -e "[PASS] UT ${ut_suite} test Pass" fi -fi \ No newline at end of file +fi From 24a5faf39417f51d489dad5323223be9f59986f1 Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Sun, 29 Jun 2025 19:21:33 +0800 Subject: [PATCH 83/83] skip two new cases which do not have related environment variable --- test/xpu/skip_list_dist_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index b06449f00..5b2460df1 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -256,6 +256,9 @@ # ValueError: Cannot use ReduceOp.PREMUL_SUM with XCCL # https://github.com/intel/torch-xpu-ops/issues/1571, 2.8 skipped "test_set_reduce_scatter_divide_factor", + # NO related environment variable on XPU + 
"test_fully_shard_force_sum_both_reductions", + "test_fully_shard_force_sum_reduce_scatter", ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": ( # torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised