diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 5d4b189e8..f522a468c 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -261,4 +261,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index a6d94ce41..07dad4c7b 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -133,19 +133,20 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - grep -E "have failures" xpu_distributed_test.log | awk '{print $1}' >> ./"${ut_suite}"_xpu_distributed_test_failed.log - compare_and_filter_logs "${ut_suite}"_xpu_distributed_test_failed.log Known_issue.log - if [[ -f "${ut_suite}_xpu_distributed_test_failed_filtered.log" ]]; then - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed_filtered.log") +if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "^FAILED" "${ut_suite}"_test.log | awk '{print $3}' > ./"${ut_suite}"_test_failed.log + # grep -E "have failures" "${ut_suite}"_test.log | awk '{print $1}' >> ./"${ut_suite}"_test_failed.log + sed -i '/^[^.]\+/d' ./"${ut_suite}"_test_failed.log + compare_and_filter_logs "${ut_suite}"_test_failed.log Known_issue.log + if [[ -f "${ut_suite}_test_failed_filtered.log" ]]; then + num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_test_failed_filtered.log") else - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") + num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_test_failed.log") fi echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu 
distributed" echo -e "=========================================================================" - cat "./${ut_suite}_xpu_distributed_test_failed.log" + cat "./${ut_suite}_test_failed.log" ((num_failed=num_failed_xpu_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 1b2980605..9bbe03873 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -73,8 +73,33 @@ jobs: pip install -U pip wheel setuptools - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - with: - path: torch-xpu-ops + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_build || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_build + conda create -n xpu_build python=${{ inputs.python }} cmake=3.28 ninja -y + source activate xpu_build + cd ../ && sudo rm -rf pytorch + pip install requests + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi - name: Build Pytorch XPU run: | set -xe -o pipefail @@ -121,13 +146,13 @@ jobs: if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/torch*.whl - name: Upload Build Log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/pytorch_*.log - name: Cleanup if: always() diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 35a4d2f37..f4043f92c 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -104,7 +104,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -396,7 +396,7 @@ jobs: rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests git clone https://github.com/pytorch/pytorch pytorch if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then @@ -432,7 +432,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -554,3 +554,287 @@ jobs: with: name: 
Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed-checked path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. 
+ else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: Distributed UT Test Results Summary + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + pip install junitparser + python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/pytorch_distributed_summary/* >> $GITHUB_STEP_SUMMARY || true + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. 
+ else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: Distributed UT Test Results Summary + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + pip install junitparser + python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/pytorch_distributed_summary/* >> $GITHUB_STEP_SUMMARY || true + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 74b9761d2..4dfc1bd85 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -110,6 +110,31 @@ jobs: disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }} ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build-distributed.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e preci-linux-e2e: if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_e2e') }} diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..3008a8fc1 --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,138 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict, skip_dict_python + +res = 0 +res2 = 0 +fail_test = [] +error_log = "" + +# libfabric WA to hang issue +os.environ["FI_PROVIDER"] = "tcp" + +os.environ["PYTHONPATH"] = 
"$PYTHONPATH:../../../../test/distributed/pipelining" +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + + +from xpu_test_utils import launch_test + + +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + + +for key in skip_dict_python: + skip_list = skip_dict_python[key] if skip_dict_python[key] else [] + test_command = ["python", key] + fail = run(test_command) + num_skipped = 0 + num_err = 0 + if fail.returncode: + for i, err in enumerate(fail.stderr.split("FAIL: ")): + if i == 0 and len(err) > 0: + error_log += err + continue + is_skipped = False + for skip_case in skip_list: + if skip_case in err: + print("Skipped error: ", key + " " + skip_case) + num_skipped += 1 + is_skipped = True + break + if not is_skipped: + num_err += 1 + res2 += fail.returncode + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + 
error_log += ( + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n" + ) + else: + error_log += line + "\n" + else: + error_log += "FAIL: " + err + else: + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ( + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n" + ) + + renamed_key = key.replace("../../../../", "").replace("/", "_") + if num_err > 0: + fail_test.append(key) + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(error_log) + else: + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(fail.stdout) + f.write(fail.stderr) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res2) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..5b2460df1 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,351 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + # ), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( + # Accuracy gap in FSDP checkpoint related UT + # 
https://github.com/intel/torch-xpu-ops/issues/1666, 2.8 skipped + "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", + "test_checkpoint_submodule_use_reentrant_False_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": None, + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504, Performance test, should skip + "test_fsdp_optimizer_overlap", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": 
None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": None, + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504 + # "test_forward_overlap", + # "test_forward_overlap_xpu", + # ), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! 
+ # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_scatter_object_list_cpu", + "test_scatter_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": ( + # RuntimeError: Process 1 exited with error code 10 and exception: + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_scatter_1d", + ), + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AssertionError: 'setattr() on Tensor.requires_grad' not found in 'Attempted to call function marked as skipped + # https://github.com/intel/torch-xpu-ops/issues/1667, 2.8 skipped + "test_fsdp_setattr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": None, + "../../../../test/distributed/test_inductor_collectives.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1581, 2.8 skipped + # Fatal Python error: Segmentation fault + "test_dynamo_rewrite_dist_all_gather", + "test_dynamo_rewrite_dist_all_gather_list", + "test_dynamo_rewrite_dist_all_gather_args_match", + "test_dynamo_rewrite_dist_reduce_scatter", + "test_dynamo_support_collective_op_with_async_op_False", + "test_dynamo_trace_reduce_scatter_tensor", + "test_dynamo_trace_all_gather_tensor", + "test_dynamo_trace_allgather_coalesced", + "test_inductor_reduce_scatter_coalesced", + "test_inductor_all_gather_coalesced", + "test_reorder_peak_memory", + ), + "../../../../test/distributed/test_multi_threaded_pg.py": None, + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + # ( + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504 + # "test_stage_backward_weight_multiple_iters_xpu", + # "test_stage_backward_weight_xpu", + # "test_stage_backward_xpu", + # ), + 
"../../../../test/distributed/pipelining/test_microbatch.py": None, + # ( + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504, need retest with oneccl fix + # "test_chunk_spec_xpu", + # ), + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( + # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' + # is not currently implemented for the XPU device + # https://github.com/intel/torch-xpu-ops/issues/1547, 2.8 skipped + "test_dtensor_seq_par_shard_dim_0", + "test_dtensor_seq_par_shard_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' + # https://github.com/intel/torch-xpu-ops/issues/1548, 2.8 skipped + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", + # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' 
+ # https://github.com/intel/torch-xpu-ops/issues/1549, 2.8 skipped + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", + # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1550, 2.8 skipped + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", + # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' + # is not currently implemented for the XPU device. 
+ # https://github.com/intel/torch-xpu-ops/issues/1551, 2.8 skipped + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_2", + ), + "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( + # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! + # https://github.com/intel/torch-xpu-ops/issues/1555, 2.8 skipped + "test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_output__tok_embeddings", + "test_transformer_training_is_seq_parallel_False_float32", + "test_transformer_training_is_seq_parallel_True_float32", + # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. 
+ # https://github.com/intel/torch-xpu-ops/issues/1556, 2.8 skipped + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_loss_parallel", + "test_mlp_training_is_seq_parallel_False_recompute_activation_False", + "test_mlp_training_is_seq_parallel_True_recompute_activation_False", + "test_transformer_req_grad_float64_thaw_all", + "test_transformer_training_is_seq_parallel_False_float64", + "test_transformer_training_is_seq_parallel_True_float64", + ), + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_linear_col_wise_parallel", + "test_parallelize_mlp_with_module_api", + "test_parallelize_mlp_with_module_api_nested", + "test_parallelize_module_multi_wildcard", + "test_parallelize_module_src_data_rank", + "test_parallelize_module_with_digit", + "test_parallelize_module_with_question", + "test_parallelize_module_with_star", + "test_under_devicemesh_context", + ), + "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, + "../../../../test/distributed/tensor/test_api.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_distribute_tensor_rank", + "test_distribute_tensor_uneven_sharding", + ), + "../../../../test/distributed/tensor/test_attention.py": None, + "../../../../test/distributed/tensor/test_common_rules.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": None, + "../../../../test/distributed/tensor/test_dtensor_compile.py": None, + "../../../../test/distributed/tensor/test_experimental_ops.py": None, + "../../../../test/distributed/tensor/test_init.py": None, + "../../../../test/distributed/tensor/test_math_ops.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_cumsum", + "test_layer_norm_bwd", + "test_layer_norm_bwd_req_grad", + 
"test_layer_norm_fwd", + "test_linear_op_reductions", + "test_shard0_svd", + "test_softmax_fwd", + "test_topk", + ), + "../../../../test/distributed/tensor/test_random_ops.py": None, + "../../../../test/distributed/tensor/test_redistribute.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_redistribute_shard_dim_change", + "test_redistribute_uneven_sharding", + "test_shard_to_replicate_forward_backward", + "test_shard_to_replicate_forward_backward_datatype_conversion", + "test_multi_dim_mesh", + ), + "../../../../test/distributed/tensor/test_tensor_ops.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_aten_contiguous", + "test_gather", + "test_index", + "test_slice", + "test_stack", + "test_where_type_promotion", + ), + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_argmax", + "test_softmax_fwd", + ), + # FSDP2 + "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_nontensor_activations", + "test_unused_forward_module", + "test_unused_forward_output", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( + # ValueError: Cannot use ReduceOp.PREMUL_SUM with XCCL + # https://github.com/intel/torch-xpu-ops/issues/1571, 2.8 skipped + "test_set_reduce_scatter_divide_factor", + # NO related environment variable on XPU + "test_fully_shard_force_sum_both_reductions", + "test_fully_shard_force_sum_reduce_scatter", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": ( + # torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised + # https://github.com/intel/torch-xpu-ops/issues/1665, 2.8 skipped + "test_transformer_backend_inductor_fullgraph_True", + 
"test_nested_fully_shard_backend_inductor_fullgraph_True", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_train_mixed_requires_grad_per_group", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_compute_dtype", + "test_grad_acc_with_reduce_dtype", + "test_reduce_dtype", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": ( + # Performance test, should skip + "test_fully_shard_training_overlap", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( + # checkpointing issue, 2.8 skipped + "test_train_parity_with_activation_checkpointing", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_1f1b_microbatching", + "test_gradient_accumulation", + ), + "../../../../test/distributed/_composable/test_replicate_with_compiler.py": ( + # AssertionError: Tensor-likes are not close! 
+ # https://github.com/intel/torch-xpu-ops/issues/1668, 2.8 skipped + "test_compile_backward_only", + "test_compile_bf16", + "test_compile_fp16", + "test_compile_gpu", + "test_compile_gpu_ac", + ), + "../../../../test/distributed/_shard/test_sharder.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_custom_sharder", + ), + "../../../../test/distributed/_shard/sharded_tensor/test_logger.py": None, + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor.py": ( + # RuntimeError: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) + # https://github.com/intel/torch-xpu-ops/issues/1617, 2.8 skipped + "test_complete_world_size", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_grid_sharding", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_with_rpc_names", + "test_init_from_local_tensor", + # what(): Attempting to send a Tensor with unexpected device type xpu:3 + # https://github.com/intel/torch-xpu-ops/issues/1616, 2.8 skipped + "test_init_from_local_shards", + "test_init_from_local_shards_and_global_metadata", + ), + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py": None, + "../../../../test/distributed/_shard/sharding_plan/test_sharding_plan.py": None, + "../../../../test/distributed/_shard/sharding_spec/test_sharding_spec.py": None, + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None, + # ( + # # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # # https://github.com/intel/torch-xpu-ops/issues/1508, 2.8 skipped + # "test_tracker_with_activation_checkpointing", + # ), + "../../../../test/distributed/_tools/test_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_memory_tracker.py": None, + 
"../../../../test/distributed/_tools/test_mod_tracker.py": None,
+}
+
+skip_dict_python = {
+    "distributed/test_c10d_ops_xccl.py": None,
+    "distributed/test_c10d_xccl.py": None,
+    "../../../../test/distributed/test_c10d_functional_native.py": None,
+    # "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None,  # Hang error.
+    "../../../../test/distributed/pipelining/test_stage.py": None,
+    "../../../../test/distributed/pipelining/test_transformer.py": None,
+}
diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py
index df524100b..447fc5f43 100644
--- a/test/xpu/xpu_test_utils.py
+++ b/test/xpu/xpu_test_utils.py
@@ -1163,6 +1163,7 @@ def copy_tests(
 def launch_test(test_case, skip_list=None, exe_list=None):
     os.environ["PYTORCH_ENABLE_XPU_FALLBACK"] = "1"
     os.environ["PYTORCH_TEST_WITH_SLOW"] = "1"
+    rename = test_case.replace("../../../../", "").replace("/", "_")
     if skip_list is not None:
         skip_options = ' -k "not ' + skip_list[0]
         for skip_case in skip_list[1:]:
@@ -1170,8 +1171,7 @@ def launch_test(test_case, skip_list=None, exe_list=None):
             skip_options += skip_option
         skip_options += '"'
         test_command = (
-            f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml "
-            + test_case
+            f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case
         )
         test_command += skip_options
     elif exe_list is not None:
@@ -1181,13 +1181,11 @@
             exe_options += exe_option
         exe_options += '"'
         test_command = (
-            f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml "
-            + test_case
+            f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case
         )
         test_command += exe_options
     else:
         test_command = (
-            f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml "
-            + test_case
+            f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case
         )
-    return os.system(test_command)
+    return os.system(test_command)