diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 5d4b189e8..f522a468c 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -261,4 +261,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index a6d94ce41..07dad4c7b 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -133,19 +133,20 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - grep -E "have failures" xpu_distributed_test.log | awk '{print $1}' >> ./"${ut_suite}"_xpu_distributed_test_failed.log - compare_and_filter_logs "${ut_suite}"_xpu_distributed_test_failed.log Known_issue.log - if [[ -f "${ut_suite}_xpu_distributed_test_failed_filtered.log" ]]; then - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed_filtered.log") +if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "^FAILED" "${ut_suite}"_test.log | awk '{print $3}' > ./"${ut_suite}"_test_failed.log + # grep -E "have failures" "${ut_suite}"_test.log | awk '{print $1}' >> ./"${ut_suite}"_test_failed.log + sed -i '/^[^.]\+/d' ./"${ut_suite}"_test_failed.log + compare_and_filter_logs "${ut_suite}"_test_failed.log Known_issue.log + if [[ -f "${ut_suite}_test_failed_filtered.log" ]]; then + num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_test_failed_filtered.log") else - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") + num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_test_failed.log") fi echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu 
distributed" echo -e "=========================================================================" - cat "./${ut_suite}_xpu_distributed_test_failed.log" + cat "./${ut_suite}_test_failed.log" ((num_failed=num_failed_xpu_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 1b2980605..9bbe03873 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -73,8 +73,33 @@ jobs: pip install -U pip wheel setuptools - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - with: - path: torch-xpu-ops + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_build || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_build + conda create -n xpu_build python=${{ inputs.python }} cmake=3.28 ninja -y + source activate xpu_build + cd ../ && sudo rm -rf pytorch + pip install requests + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi - name: Build Pytorch XPU run: | set -xe -o pipefail @@ -121,13 +146,13 @@ jobs: if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/torch*.whl - name: Upload Build Log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/pytorch_*.log - name: Cleanup if: always() diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 35a4d2f37..f4043f92c 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -104,7 +104,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -396,7 +396,7 @@ jobs: rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests git clone https://github.com/pytorch/pytorch pytorch if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then @@ -432,7 +432,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -554,3 +554,287 @@ jobs: with: name: 
Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed-checked path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. 
+ else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: Distributed UT Test Results Summary + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + pip install junitparser + python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/pytorch_distributed_summary/* >> $GITHUB_STEP_SUMMARY || true + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. 
+ else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: Distributed UT Test Results Summary + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + pip install junitparser + python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/pytorch_distributed_summary/* >> $GITHUB_STEP_SUMMARY || true + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 74b9761d2..4dfc1bd85 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -110,6 +110,31 @@ jobs: disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }} ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build-distributed.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e preci-linux-e2e: if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_e2e') }} diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..3008a8fc1 --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,138 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict, skip_dict_python + +res = 0 +res2 = 0 +fail_test = [] +error_log = "" + +# libfabric WA to hang issue +os.environ["FI_PROVIDER"] = "tcp" + +os.environ["PYTHONPATH"] = 
"$PYTHONPATH:../../../../test/distributed/pipelining" +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + + +from xpu_test_utils import launch_test + + +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + + +for key in skip_dict_python: + skip_list = skip_dict_python[key] if skip_dict_python[key] else [] + test_command = ["python", key] + fail = run(test_command) + num_skipped = 0 + num_err = 0 + if fail.returncode: + for i, err in enumerate(fail.stderr.split("FAIL: ")): + if i == 0 and len(err) > 0: + error_log += err + continue + is_skipped = False + for skip_case in skip_list: + if skip_case in err: + print("Skipped error: ", key + " " + skip_case) + num_skipped += 1 + is_skipped = True + break + if not is_skipped: + num_err += 1 + res2 += fail.returncode + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + 
error_log += ( + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n" + ) + else: + error_log += line + "\n" + else: + error_log += "FAIL: " + err + else: + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ( + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n" + ) + + renamed_key = key.replace("../../../../", "").replace("/", "_") + if num_err > 0: + fail_test.append(key) + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(error_log) + else: + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(fail.stdout) + f.write(fail.stderr) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res2) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..5b2460df1 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,351 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + # ), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( + # Accuracy gap in FSDP checkpoint related UT + # 
https://github.com/intel/torch-xpu-ops/issues/1666, 2.8 skipped + "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", + "test_checkpoint_submodule_use_reentrant_False_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": None, + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504, Performance test, should skip + "test_fsdp_optimizer_overlap", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": 
None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": None, + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504 + # "test_forward_overlap", + # "test_forward_overlap_xpu", + # ), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! 
+ # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_scatter_object_list_cpu", + "test_scatter_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": ( + # RuntimeError: Process 1 exited with error code 10 and exception: + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_scatter_1d", + ), + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AssertionError: 'setattr() on Tensor.requires_grad' not found in 'Attempted to call function marked as skipped + # https://github.com/intel/torch-xpu-ops/issues/1667, 2.8 skipped + "test_fsdp_setattr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": None, + "../../../../test/distributed/test_inductor_collectives.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1581, 2.8 skipped + # Fatal Python error: Segmentation fault + "test_dynamo_rewrite_dist_all_gather", + "test_dynamo_rewrite_dist_all_gather_list", + "test_dynamo_rewrite_dist_all_gather_args_match", + "test_dynamo_rewrite_dist_reduce_scatter", + "test_dynamo_support_collective_op_with_async_op_False", + "test_dynamo_trace_reduce_scatter_tensor", + "test_dynamo_trace_all_gather_tensor", + "test_dynamo_trace_allgather_coalesced", + "test_inductor_reduce_scatter_coalesced", + "test_inductor_all_gather_coalesced", + "test_reorder_peak_memory", + ), + "../../../../test/distributed/test_multi_threaded_pg.py": None, + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + # ( + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504 + # "test_stage_backward_weight_multiple_iters_xpu", + # "test_stage_backward_weight_xpu", + # "test_stage_backward_xpu", + # ), + 
"../../../../test/distributed/pipelining/test_microbatch.py": None, + # ( + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504, need retest with oneccl fix + # "test_chunk_spec_xpu", + # ), + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( + # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' + # is not currently implemented for the XPU device + # https://github.com/intel/torch-xpu-ops/issues/1547, 2.8 skipped + "test_dtensor_seq_par_shard_dim_0", + "test_dtensor_seq_par_shard_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' + # https://github.com/intel/torch-xpu-ops/issues/1548, 2.8 skipped + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", + # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' 
+ # https://github.com/intel/torch-xpu-ops/issues/1549, 2.8 skipped + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", + # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1550, 2.8 skipped + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", + # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' + # is not currently implemented for the XPU device. 
+ # https://github.com/intel/torch-xpu-ops/issues/1551, 2.8 skipped + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_2", + ), + "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( + # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! + # https://github.com/intel/torch-xpu-ops/issues/1555, 2.8 skipped + "test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_output__tok_embeddings", + "test_transformer_training_is_seq_parallel_False_float32", + "test_transformer_training_is_seq_parallel_True_float32", + # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. 
+ # https://github.com/intel/torch-xpu-ops/issues/1556, 2.8 skipped + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_loss_parallel", + "test_mlp_training_is_seq_parallel_False_recompute_activation_False", + "test_mlp_training_is_seq_parallel_True_recompute_activation_False", + "test_transformer_req_grad_float64_thaw_all", + "test_transformer_training_is_seq_parallel_False_float64", + "test_transformer_training_is_seq_parallel_True_float64", + ), + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_linear_col_wise_parallel", + "test_parallelize_mlp_with_module_api", + "test_parallelize_mlp_with_module_api_nested", + "test_parallelize_module_multi_wildcard", + "test_parallelize_module_src_data_rank", + "test_parallelize_module_with_digit", + "test_parallelize_module_with_question", + "test_parallelize_module_with_star", + "test_under_devicemesh_context", + ), + "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, + "../../../../test/distributed/tensor/test_api.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_distribute_tensor_rank", + "test_distribute_tensor_uneven_sharding", + ), + "../../../../test/distributed/tensor/test_attention.py": None, + "../../../../test/distributed/tensor/test_common_rules.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": None, + "../../../../test/distributed/tensor/test_dtensor_compile.py": None, + "../../../../test/distributed/tensor/test_experimental_ops.py": None, + "../../../../test/distributed/tensor/test_init.py": None, + "../../../../test/distributed/tensor/test_math_ops.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_cumsum", + "test_layer_norm_bwd", + "test_layer_norm_bwd_req_grad", + 
"test_layer_norm_fwd", + "test_linear_op_reductions", + "test_shard0_svd", + "test_softmax_fwd", + "test_topk", + ), + "../../../../test/distributed/tensor/test_random_ops.py": None, + "../../../../test/distributed/tensor/test_redistribute.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_redistribute_shard_dim_change", + "test_redistribute_uneven_sharding", + "test_shard_to_replicate_forward_backward", + "test_shard_to_replicate_forward_backward_datatype_conversion", + "test_multi_dim_mesh", + ), + "../../../../test/distributed/tensor/test_tensor_ops.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_aten_contiguous", + "test_gather", + "test_index", + "test_slice", + "test_stack", + "test_where_type_promotion", + ), + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_argmax", + "test_softmax_fwd", + ), + # FSDP2 + "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_nontensor_activations", + "test_unused_forward_module", + "test_unused_forward_output", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( + # ValueError: Cannot use ReduceOp.PREMUL_SUM with XCCL + # https://github.com/intel/torch-xpu-ops/issues/1571, 2.8 skipped + "test_set_reduce_scatter_divide_factor", + # NO related environment variable on XPU + "test_fully_shard_force_sum_both_reductions", + "test_fully_shard_force_sum_reduce_scatter", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": ( + # torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised + # https://github.com/intel/torch-xpu-ops/issues/1665, 2.8 skipped + "test_transformer_backend_inductor_fullgraph_True", + 
"test_nested_fully_shard_backend_inductor_fullgraph_True", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_train_mixed_requires_grad_per_group", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_compute_dtype", + "test_grad_acc_with_reduce_dtype", + "test_reduce_dtype", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": ( + # Performance test, should skip + "test_fully_shard_training_overlap", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( + # checkpointing issue, 2.8 skipped + "test_train_parity_with_activation_checkpointing", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_1f1b_microbatching", + "test_gradient_accumulation", + ), + "../../../../test/distributed/_composable/test_replicate_with_compiler.py": ( + # AssertionError: Tensor-likes are not close! 
+ # https://github.com/intel/torch-xpu-ops/issues/1668, 2.8 skipped + "test_compile_backward_only", + "test_compile_bf16", + "test_compile_fp16", + "test_compile_gpu", + "test_compile_gpu_ac", + ), + "../../../../test/distributed/_shard/test_sharder.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_custom_sharder", + ), + "../../../../test/distributed/_shard/sharded_tensor/test_logger.py": None, + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor.py": ( + # RuntimeError: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) + # https://github.com/intel/torch-xpu-ops/issues/1617, 2.8 skipped + "test_complete_world_size", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_grid_sharding", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_with_rpc_names", + "test_init_from_local_tensor", + # what(): Attempting to send a Tensor with unexpected device type xpu:3 + # https://github.com/intel/torch-xpu-ops/issues/1616, 2.8 skipped + "test_init_from_local_shards", + "test_init_from_local_shards_and_global_metadata", + ), + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py": None, + "../../../../test/distributed/_shard/sharding_plan/test_sharding_plan.py": None, + "../../../../test/distributed/_shard/sharding_spec/test_sharding_spec.py": None, + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None, + # ( + # # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # # https://github.com/intel/torch-xpu-ops/issues/1508, 2.8 skipped + # "test_tracker_with_activation_checkpointing", + # ), + "../../../../test/distributed/_tools/test_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_memory_tracker.py": None, + 
"../../../../test/distributed/_tools/test_mod_tracker.py": None,
+}
+
+skip_dict_python = {
+    "distributed/test_c10d_ops_xccl.py": None,
+    "distributed/test_c10d_xccl.py": None,
+    "../../../../test/distributed/test_c10d_functional_native.py": None,
+    # "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None,  # Hang error.
+    "../../../../test/distributed/pipelining/test_stage.py": None,
+    "../../../../test/distributed/pipelining/test_transformer.py": None,
+}
diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py
index df524100b..447fc5f43 100644
--- a/test/xpu/xpu_test_utils.py
+++ b/test/xpu/xpu_test_utils.py
@@ -1163,6 +1163,7 @@ def copy_tests(
 def launch_test(test_case, skip_list=None, exe_list=None):
     os.environ["PYTORCH_ENABLE_XPU_FALLBACK"] = "1"
     os.environ["PYTORCH_TEST_WITH_SLOW"] = "1"
+    rename = test_case.replace("../../../../", "").replace("/", "_")
     if skip_list is not None:
         skip_options = ' -k "not ' + skip_list[0]
         for skip_case in skip_list[1:]:
@@ -1170,8 +1171,7 @@ def launch_test(test_case, skip_list=None, exe_list=None):
             skip_options += skip_option
         skip_options += '"'
         test_command = (
-            f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml "
-            + test_case
+            f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case
         )
         test_command += skip_options
     elif exe_list is not None:
@@ -1181,13 +1181,11 @@
             exe_options += exe_option
         exe_options += '"'
         test_command = (
-            f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml "
-            + test_case
+            f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case
         )
         test_command += exe_options
     else:
         test_command = (
-            f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml "
-            + test_case
+            f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case
         )
-    return os.system(test_command)
+    return os.system(test_command)