Skip to content

Commit 0d6f048

Browse files
committed
Merge remote-tracking branch 'origin' into daisyden/distributed_2.8
2 parents 254ced6 + 34a9292 commit 0d6f048

20 files changed: +436 additions, −304 deletions

.github/actions/inductor-xpu-e2e-test/action.yml

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,19 @@ runs:
6262
fi
6363
cd ../ && python -c "import torch, torchvision, torchaudio"
6464
rm -rf benchmark && git clone https://github.com/pytorch/benchmark.git
65-
cd benchmark && git checkout $TORCHBENCH_COMMIT_ID && pip install --no-deps -r requirements.txt
65+
cd benchmark && git checkout $TORCHBENCH_COMMIT_ID
66+
# remove deps which will reinstall torch
67+
pip install --no-deps accelerate
68+
pip install --no-deps $(cat requirements.txt |grep 'pytorch-image-models')
69+
timm_commit="$(grep 'pytorch-image-models' requirements.txt |awk -F '@' '{print $2}')"
70+
pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/${timm_commit:-"main"}/requirements.txt | grep -vE torch)
71+
sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g' requirements.txt
72+
pip install -r requirements.txt
6673
python install.py --continue_on_fail
6774
# deps for torchrec_dlrm
6875
pip install pyre_extensions
6976
pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cpu
70-
pip install torchmetrics==1.0.3
71-
pip install torchrec --no-deps --index-url https://download.pytorch.org/whl/nightly/cpu
77+
pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec
7278
fi
7379
if [[ ${{ inputs.suite }} == *"huggingface"* ]]; then
7480
pip install --force-reinstall git+https://github.com/huggingface/transformers@${TRANSFORMERS_VERSION}
@@ -127,7 +133,7 @@ runs:
127133
contains "accuracy,performance" $scenario
128134
$contains_status
129135
if [ "${MODEL_ONLY_NAME}" == "" ];then
130-
xpu_list=($(xpu-smi discovery |grep 'DRM Device: /dev/' |sed 's/.*card//;s/[^0-9].*//' |awk '{print $1 - 1":"NR - 1}'))
136+
xpu_list=($(xpu-smi discovery |grep 'DRM Device: /dev/' |sed 's/.*card//;s/[^0-9].*//' |awk '{if($1==0){print $1":"NR - 1}else{print $1 - 1":"NR - 1}}'))
131137
for xpu_id in ${xpu_list[*]}
132138
do
133139
bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${xpu_id/:*} static ${#xpu_list[*]} ${xpu_id/*:} &

.github/scripts/apply_torch_pr.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,10 @@
1010
parser.add_argument('--pr-list', '-n', nargs='+',
1111
default=[
1212
# Fallback to CPU for XPU FP64
13-
"https://github.com/pytorch/pytorch/pull/126516",
13+
"https://github.com/pytorch/pytorch/pull/156456",
1414
# Modify the tolerance level in TIMM benchmark
1515
"https://github.com/pytorch/pytorch/pull/143739",
16-
# Allow XPU device for validating the arguments to sparse compressed tensor factory functions
17-
"https://github.com/pytorch/pytorch/pull/147306",
18-
"Enhance testing infrastructure to add half-precision support for histc on XPU"
16+
# "Enhance testing infrastructure to add half-precision support for histc on XPU"
1917
"https://github.com/pytorch/pytorch/pull/154339",
2018
]
2119
)

.github/workflows/_linux_accelerate.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ on:
3939
default: 'v4.51.3'
4040
description: Transformers version
4141

42-
permissions: read-all
4342
concurrency:
4443
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
4544
cancel-in-progress: true

.github/workflows/_linux_build.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,6 @@ on:
3838
description: The commit id of the torch build
3939
value: ${{ jobs.build.outputs.TORCH_COMMIT_ID }}
4040

41-
permissions:
42-
issues: write
43-
4441
jobs:
4542
build:
4643
runs-on: ${{ inputs.runner }}

.github/workflows/_linux_op_benchmark.yml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,6 @@ on:
3434
default: 'rolling'
3535
description: Driver lts/rolling
3636

37-
permissions: read-all
38-
3937
jobs:
4038
op_benchmark_test:
4139
runs-on: ${{ inputs.runner }}
@@ -81,7 +79,6 @@ jobs:
8179
- name: Install Pytorch XPU
8280
run: |
8381
source activate xpu_op_${ZE_AFFINITY_MASK}
84-
source .github/scripts/env.sh ${{ inputs.pytorch }}
8582
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
8683
cd ../pytorch
8784
export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
@@ -96,7 +93,6 @@ jobs:
9693
- name: Torch Config
9794
run: |
9895
source activate xpu_op_${ZE_AFFINITY_MASK}
99-
source .github/scripts/env.sh ${{ inputs.pytorch }}
10096
python -c "import torch; print(torch.__config__.show())"
10197
python -c "import torch; print(torch.__config__.parallel_info())"
10298
python -c "import torch; print(torch.__config__.torch.xpu.device_count())"
@@ -108,7 +104,6 @@ jobs:
108104
- name: Run Torch XPU Op Benchmark
109105
if: ${{ inputs.driver == 'rolling' }}
110106
run: |
111-
source .github/scripts/env.sh ${{ inputs.pytorch }}
112107
source activate xpu_op_${ZE_AFFINITY_MASK}
113108
mkdir -p ${{ github.workspace }}/op_benchmark
114109
cd test/microbench

.github/workflows/_linux_transformers.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ on:
4444
default: 'v4.51.3'
4545
description: Transformers version
4646

47-
permissions: read-all
4847
concurrency:
4948
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
5049
cancel-in-progress: true
@@ -314,7 +313,7 @@ jobs:
314313

315314
report:
316315
needs: tests
317-
if: "always()"
316+
if: ${{ always() }}
318317
runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }}
319318
steps:
320319
- name: Download reports

.github/workflows/_linux_ut.yml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,6 @@ on:
4444
default: 'lts'
4545
description: Driver lts/rolling
4646

47-
permissions: read-all
48-
4947
jobs:
5048
ut_test:
5149
runs-on: ${{ inputs.runner }}
@@ -88,15 +86,16 @@ jobs:
8886
run: |
8987
source activate xpu_op_${ZE_AFFINITY_MASK}
9088
cd ../pytorch
91-
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
9289
if [ -z ${{ inputs.triton }} ]; then
9390
TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)"
9491
else
9592
TRITON_COMMIT_ID="${{ inputs.triton }}"
9693
fi
97-
echo ${TRITON_REPO}@${TRITON_COMMIT_ID}
9894
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
99-
pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
95+
pip install cmake ninja pybind11
96+
rm -rf pytorch_triton_xpu-*.whl
97+
python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID}
98+
pip install pytorch_triton_xpu-*.whl
10099
fi
101100
- name: Download Pytorch wheel
102101
if: ${{ inputs.pytorch != 'nightly_wheel' }}
@@ -415,15 +414,16 @@ jobs:
415414
run: |
416415
source activate xpu_op_${ZE_AFFINITY_MASK}
417416
cd ../pytorch
418-
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
419417
if [ -z ${{ inputs.triton }} ]; then
420418
TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)"
421419
else
422420
TRITON_COMMIT_ID="${{ inputs.triton }}"
423421
fi
424-
echo ${TRITON_REPO}@${TRITON_COMMIT_ID}
425422
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
426-
pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
423+
pip install cmake ninja pybind11
424+
rm -rf pytorch_triton_xpu-*.whl
425+
python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID}
426+
pip install pytorch_triton_xpu-*.whl
427427
fi
428428
- name: Download Pytorch wheel
429429
if: ${{ inputs.pytorch != 'nightly_wheel' }}

.github/workflows/_performance_comparison.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ on:
1414
default: ''
1515
description: Baseline run id
1616

17-
permissions: read-all
18-
1917
jobs:
2018
Performance-Comparison:
2119
env:

.github/workflows/_windows_ut.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,6 @@ on:
4444
default: 'false'
4545
description: Check if labelled
4646

47-
permissions: read-all
48-
4947
env:
5048
USE_XPU: 1
5149

.github/workflows/nightly_ondemand.yml

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -59,19 +59,15 @@ on:
5959
default: '3.10'
6060
description: Python version
6161

62-
permissions: write-all
63-
6462
concurrency:
6563
group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.ut }}-${{ inputs.triton }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }}
6664
cancel-in-progress: ${{ github.event_name != 'schedule' }}
6765

6866
jobs:
6967
Linux-Nightly-Ondemand-Build:
7068
if: ${{ github.repository_owner == 'intel' }}
71-
secrets: inherit
7269
name: linux-nightly-ondemand
73-
permissions:
74-
issues: write
70+
secrets: inherit
7571
uses: ./.github/workflows/_linux_build.yml
7672
with:
7773
pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
@@ -177,9 +173,10 @@ jobs:
177173
run: |
178174
source activate e2e_ci
179175
cd ../pytorch
180-
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
181-
echo ${TRITON_REPO}@${TRITON_COMMIT_ID}
182-
pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
176+
pip install cmake ninja pybind11
177+
rm -rf pytorch_triton_xpu-*.whl
178+
python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID}
179+
pip install pytorch_triton_xpu-*.whl
183180
- name: Download Pytorch wheel
184181
if: ${{ inputs.pytorch != 'nightly_wheel' }}
185182
uses: actions/download-artifact@v4
@@ -365,15 +362,13 @@ jobs:
365362
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
366363
ut: ${{ github.event_name == 'schedule' && 'op_extended,torch_xpu' || inputs.ut }}
367364
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
368-
files-changed: false
369-
has-label: true
365+
src_changed: false
366+
has_label: true
370367
runner: Windows_CI
371368

372369
Tests-Failure-And-Report:
373370
if: ${{ ! cancelled() }}
374371
runs-on: [ self-hosted, Linux ]
375-
permissions:
376-
issues: write
377372
env:
378373
GH_TOKEN: ${{ github.token }}
379374
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}

.github/workflows/nightly_ondemand_rolling.yml

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -59,19 +59,15 @@ on:
5959
default: '3.10'
6060
description: Python version
6161

62-
permissions: write-all
63-
6462
concurrency:
6563
group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.ut }}-${{ inputs.triton }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }}
6664
cancel-in-progress: ${{ github.event_name != 'schedule' }}
6765

6866
jobs:
6967
Linux-Nightly-Ondemand-Build-Rolling:
7068
if: ${{ github.repository_owner == 'intel' }}
71-
secrets: inherit
7269
name: linux-nightly-ondemand-rolling
73-
permissions:
74-
issues: write
70+
secrets: inherit
7571
uses: ./.github/workflows/_linux_build.yml
7672
with:
7773
pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
@@ -193,9 +189,10 @@ jobs:
193189
run: |
194190
source activate e2e_ci
195191
cd ../pytorch
196-
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
197-
echo ${TRITON_REPO}@${TRITON_COMMIT_ID}
198-
pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
192+
pip install cmake ninja pybind11
193+
rm -rf pytorch_triton_xpu-*.whl
194+
python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID}
195+
pip install pytorch_triton_xpu-*.whl
199196
- name: Download Pytorch wheel
200197
if: ${{ inputs.pytorch != 'nightly_wheel' }}
201198
uses: actions/download-artifact@v4
@@ -385,8 +382,6 @@ jobs:
385382
Tests-Failure-And-Report:
386383
if: ${{ ! cancelled() }}
387384
runs-on: [ self-hosted, Linux ]
388-
permissions:
389-
issues: write
390385
env:
391386
GH_TOKEN: ${{ github.token }}
392387
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}

.github/workflows/nightly_ondemand_whl.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,6 @@ on:
4949
default: '3.10'
5050
description: Python version
5151

52-
permissions: write-all
53-
5452
concurrency:
5553
group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.ut }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }}
5654
cancel-in-progress: ${{ github.event_name != 'schedule' }}
@@ -322,8 +320,6 @@ jobs:
322320
Tests-Failure-And-Report:
323321
if: ${{ ! cancelled() }}
324322
runs-on: [ self-hosted, Linux ]
325-
permissions:
326-
issues: write
327323
env:
328324
GH_TOKEN: ${{ github.token }}
329325
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}

.github/workflows/pull.yml

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@ concurrency:
1717
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
1818
cancel-in-progress: true
1919

20-
permissions: read-all
21-
2220
jobs:
2321
preci-lint-check:
2422
name: preci-lint-check
@@ -98,8 +96,6 @@ jobs:
9896
if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all')}}
9997
needs: [preci-conditions-filter]
10098
secrets: inherit
101-
permissions:
102-
issues: write
10399
uses: ./.github/workflows/_linux_build.yml
104100
with:
105101
pytorch: main
@@ -179,10 +175,10 @@ jobs:
179175
run: |
180176
source activate e2e_ci
181177
cd ../pytorch
182-
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
183-
TRITON_PINNED_COMMIT=$(cat .ci/docker/ci_commit_pins/triton-xpu.txt)
184-
echo ${TRITON_REPO}@${TRITON_PINNED_COMMIT}
185-
pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python"
178+
pip install cmake ninja pybind11
179+
rm -rf pytorch_triton_xpu-*.whl
180+
python .github/scripts/build_triton_wheel.py --device xpu
181+
pip install pytorch_triton_xpu-*.whl
186182
- name: Identify pinned versions
187183
run: |
188184
cd ../pytorch

src/ATen/native/xpu/SpectralOps.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
#include <ATen/native/Resize.h>
2-
#include <ATen/ops/_fft_r2c_native.h>
31
#if defined(USE_ONEMKL_XPU)
42
#include <ATen/native/xpu/mkl/SpectralOps.h>
53
#else
4+
#include <ATen/native/Resize.h>
65
#include <ATen/ops/_fft_c2c_native.h>
76
#include <ATen/ops/_fft_c2r_native.h>
7+
#include <ATen/ops/_fft_r2c_native.h>
88
#endif // USE_ONEMKL_XPU
99

1010
namespace at::native {
@@ -87,9 +87,13 @@ Tensor _fft_r2c_xpu(
8787
bool onesided) {
8888
TORCH_CHECK(self.is_floating_point());
8989

90+
#if defined(USE_ONEMKL_XPU)
91+
return native::xpu::_fft_r2c_mkl(self, dim, normalization, onesided);
92+
#else
9093
Tensor out_cpu = native::_fft_r2c_mkl(
9194
self.to(Device(at::kCPU)), dim, normalization, onesided);
9295
return out_cpu.to(Device(at::kXPU));
96+
#endif // USE_ONEMKL_XPU
9397
}
9498

9599
Tensor& _fft_r2c_xpu_out(
@@ -100,11 +104,15 @@ Tensor& _fft_r2c_xpu_out(
100104
Tensor& out) {
101105
TORCH_CHECK(self.is_floating_point());
102106

107+
#if defined(USE_ONEMKL_XPU)
108+
return native::xpu::_fft_r2c_mkl_out(self, dim, normalization, onesided, out);
109+
#else
103110
Tensor out_cpu = native::_fft_r2c_mkl(
104111
self.to(Device(at::kCPU)), dim, normalization, onesided);
105112
at::native::resize_output(out, out_cpu.sizes());
106113
out.copy_(out_cpu);
107114
return out;
115+
#endif // USE_ONEMKL_XPU
108116
}
109117

110118
} // namespace at::native

0 commit comments

Comments (0)