From a2c3f35f047e6b711d3ccf4983856482563b1c13 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 06:01:16 -0700 Subject: [PATCH 01/83] enable fsdp cases based on local branch --- test/xpu/run_distributed_local.py | 63 +++++++++++++++++++++++++++++++ test/xpu/skip_list_dist_local.py | 57 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 test/xpu/run_distributed_local.py create mode 100644 test/xpu/skip_list_dist_local.py diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..8074b3292 --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,63 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict +from xpu_test_utils import launch_test + +res = 0 +fail_test = [] + +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log", "r") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(' ') + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split('/')[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ( "XL" in item or "S" in item ): + if len(affinity) == 0: + affinity = str(j-2) + else: + affinity = affinity + ',' + str(j-2) + gpu_dict[i] = affinity + + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..08f90c6b5 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,57 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + "test_delayed_optim_step_offload_true_no_shard_xpu", + "test_transformer_no_grad_mixed_precision_True_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + 
"../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + "test_fsdp_zero2_eval_with_prefetch", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1537 + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + "test_use_orig_params", + ), + # Performance check, skip + #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # "test_forward_overlap", + # "test_forward_overlap_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, +} From e772d23680c67301d6e9e5a47b741fc622c49158 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 19:46:24 -0700 Subject: [PATCH 02/83] add 2025.0 WA --- test/xpu/run_distributed_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 8074b3292..b6a9ef60c 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,9 @@ res = 0 fail_test = [] +os.environ["CCL_ATL_TRANSPORT"] = "ofi" +os.environ["CCL_SEND"] = "direct" +os.environ["CCL_RECV"] = "direct" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: From cbd34cd308e4cd601561c3ce64e44c408b94f730 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:27:44 -0700 Subject: [PATCH 03/83] Update distributed UT cases in DDP and PP Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 29 +++++++++- test/xpu/skip_list_dist_local.py | 91 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index b6a9ef60c..982f05409 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -2,15 +2,17 @@ import subprocess import sys -from skip_list_dist_local import skip_dict +from skip_list_dist_local import skip_dict, skip_dict_python from xpu_test_utils import launch_test res = 0 +res2 = 0 fail_test = [] 
os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" os.environ["CCL_RECV"] = "direct" +os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: @@ -48,6 +50,29 @@ print("xpu-smi topology failed") sys.exit(255) +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + +for key in skip_dict_python: + skip_list = skip_dict_python[key] + test_command = ["python", key] + fail = run(test_command) + if fail.returncode: + for line in fail.stderr.split("\n"): + if "FAIL: " in line: + is_error = True + for skip_case in skip_list: + if skip_case in line: + print("Skiped error: ", key + " " + skip_case) + is_error = False + if is_error: + res2 += fail.returncode + fail_test.append("".join(key + " " + line)) + # run pytest with skiplist for key in skip_dict: skip_list = skip_dict[key] @@ -61,6 +86,6 @@ exit_code = os.WEXITSTATUS(res) if exit_code == 0: - sys.exit(res) + sys.exit(res2) else: sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 08f90c6b5..d65b7aee6 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -54,4 +54,95 @@ "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_reduce_scatter_tensor_coalesced", + "test_reduce_scatter_tensor_single", + # https://github.com/intel/torch-xpu-ops/issues/1525 + # ValueError: trying to initialize the default process group twice! 
+ "test_inductor_all_gather_into_tensor_coalesced", + "test_inductor_all_gather_into_tensor_single", + "test_inductor_all_reduce_coalesced", + "test_inductor_all_reduce_non_contig_input", + "test_inductor_all_reduce_single", + "test_inductor_all_to_all_single", + "test_inductor_broadcast", + "test_inductor_inplace_op_on_view", + "test_inductor_reduce_scatter_tensor_coalesced", + "test_inductor_reduce_scatter_tensor_single", + "test_inductor_reuse_buffer_after_inplace_collective", + "test_ranks_and_tag", + "test_wait_tensor", + ), + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_gather_object_cpu", + "test_gather_object_xpu", + "test_gather_object_list_cpu", + "test_gather_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + "test_asymmetric_compilation", + "test_asymmetric_compilation_with_fx_cache", + # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. + "test_compiled_flex_attention_full_model_ddp", + "test_compiled_flex_attention_local_ddp", + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # https://github.com/intel/torch-xpu-ops/issues/1527 + "test_compiler_collectives_automatic_dynamic_scalar", + "test_compiler_collectives_automatic_dynamic_speculation_divergence", + "test_compiler_collectives_automatic_dynamic_tensor", + "test_compiler_collectives_dim_mismatch", + "test_compiler_collectives_graph_break_empty_graph_still_collective", + "test_compiler_collectives_missing_source", + "test_compiler_collectives_scalar_missing_source", + "test_compiler_collectives_type_mismatch", + "test_ddp_activation_checkpointing", + "test_ddp_baseline_aot_eager_multiprocess", + "test_fsdp_activation_checkpointing", + "test_fsdp_aot_eager", + "test_fsdp_inductor", + "test_fsdp_setattr", + "test_fsdp_unspecialized_forced_getattr_inline", + "test_fsdp_unspecialized_forced_getattr_no_inline", + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_get_pg_attr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_tracing_xpu", + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + ), + "../../../../test/distributed/test_multi_threaded_pg.py": ( + # oneccl not support multi-threaded well, so skip it first. 
+ "test_bwd_sees_fwd_pg", + ), + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, +} + +skip_dict_python = { + "distributed/test_c10d_ops_xccl.py": None, + "distributed/test_c10d_xccl.py": None, + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_stage.py": None, } From d856e950310ed44446d81d9b37250b7b7d4fbcc3 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:36:16 -0700 Subject: [PATCH 04/83] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 7 +++---- test/xpu/skip_list_dist_local.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 982f05409..a5f0c8098 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -11,7 +11,7 @@ os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" +os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") @@ -36,13 +36,12 @@ else: affinity = affinity + ',' + str(j-2) gpu_dict[i] = affinity - - + max_affinity = "" for key, value in gpu_dict.items(): if len(value) > len(max_affinity): max_affinity = value - + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index d65b7aee6..6ce62b8ca 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -96,7 +96,7 @@ # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
"test_compiled_flex_attention_full_model_ddp", "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ # https://github.com/intel/torch-xpu-ops/issues/1527 "test_compiler_collectives_automatic_dynamic_scalar", "test_compiler_collectives_automatic_dynamic_speculation_divergence", @@ -131,13 +131,13 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, } skip_dict_python = { From 28a259e59448bb70958a818d3f50fee62f2ebfa2 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 02:01:55 -0700 Subject: [PATCH 05/83] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 38 ++++++++++++++++--------------- test/xpu/skip_list_dist_local.py | 17 +++++++------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index a5f0c8098..d4db4785a 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -17,29 +17,29 @@ ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: gpu_dict = {} - with open("topology.log", "r") as file: + with open("topology.log") as file: lines = file.readlines() for line in lines: - if "CPU Affinity" in line: - continue - line = line.strip() - if line.startswith("GPU "): - items = line.split(' ') - items = [x for x in items if x] - gpu_id = items[1] - i = gpu_id.split('/')[0] - affinity = "" - for j, item in enumerate(items): - if "SYS" not in item and ( "XL" in item or "S" in item ): - if len(affinity) == 0: - affinity = str(j-2) - else: - affinity = affinity + ',' + str(j-2) - gpu_dict[i] = affinity + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity max_affinity = "" for key, value in gpu_dict.items(): - if len(value) > len(max_affinity): + if len(value) > len(max_affinity): max_affinity = value os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) @@ -49,6 +49,7 @@ print("xpu-smi topology failed") sys.exit(255) + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -56,6 +57,7 @@ def run(test_command): print(result.stderr) return result + for key in skip_dict_python: skip_list = skip_dict_python[key] test_command = ["python", key] diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 6ce62b8ca..0ac46961e 100644 --- a/test/xpu/skip_list_dist_local.py +++ 
b/test/xpu/skip_list_dist_local.py @@ -1,10 +1,10 @@ skip_dict = { "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, # https://github.com/intel/torch-xpu-ops/issues/1536 - #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( # "test_distributed_checkpoint_state_dict_type0_xpu", # "test_distributed_checkpoint_state_dict_type1_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, @@ -37,11 +37,11 @@ "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( "test_use_orig_params", ), - # Performance check, skip - #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( # "test_forward_overlap", # "test_forward_overlap_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, @@ -58,7 +58,7 @@ "../../../../test/distributed/test_c10d_common.py": None, "../../../../test/distributed/test_c10d_functional_native.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 - #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path "test_reduce_scatter_tensor_coalesced", "test_reduce_scatter_tensor_single", # https://github.com/intel/torch-xpu-ops/issues/1525 @@ -123,7 +123,7 @@ # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) # https://github.com/intel/torch-xpu-ops/issues/1526 "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. @@ -131,7 +131,6 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, @@ -143,6 +142,6 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, } From 62e9ff75ced8a311c1e52c61fd49c97622075378 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:25:50 +0000 Subject: [PATCH 06/83] add distributed ut in CI --- .github/scripts/ut_result_check.sh | 10 +-- .github/workflows/_linux_build.yml | 6 +- .github/workflows/_linux_ut.yml | 140 +++++++++++++++++++++++++++++ .github/workflows/pull.yml | 25 ++++++ 4 files changed, 175 insertions(+), 6 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 3fb1a1997..32dbed489 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -72,14 +72,14 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED|have failures" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") +if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" echo -e "=========================================================================" - cat "./${ut_suite}_xpu_distributed_test_failed.log" - ((num_failed=num_failed_xpu_distributed)) + cat "./${ut_suite}_test_failed.log" + ((num_failed=num_failed_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" exit 1 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index ee9381c9c..3ed1c3d4e 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -70,7 +70,11 @@ jobs: source activate xpu_build cd ../ && rm -rf pytorch pip install requests - git clone https://github.com/pytorch/pytorch pytorch + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) # apply PRs for stock pytorch python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index a11528a3e..aa631c6dd 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -435,3 +435,143 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-xpu_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + 
source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ec2a73a20..9cf7ef458 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -66,6 +66,31 @@ jobs: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} ut: op_regression,op_regression_dev1,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e Inductor-XPU-E2E-CI-Tests: name: preci-linux / e2e_test From 119d2fb5b20a32990eeb0377ce490f2fe3f89894 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:52:17 +0000 Subject: [PATCH 07/83] update if condition --- .github/workflows/_linux_build.yml | 26 ++++++++++++++------------ .github/workflows/_linux_ut.yml | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 3ed1c3d4e..eda5de367 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -72,20 +72,22 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - fi - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi - name: Build Pytorch XPU run: | diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index aa631c6dd..907c5cd2a 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -49,7 +49,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} From 5ff20baae6dba5dee9d6c2ea83773a436229e299 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 23:02:20 +0000 Subject: [PATCH 08/83] keep_torch_xpu_ops --- .github/workflows/pull.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 9cf7ef458..f0b1b8e22 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,6 +78,7 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 + keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From cc472d7823415596734eb9c7e7afb0a3b8c7203b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sun, 6 Apr 2025 19:24:08 +0000 Subject: [PATCH 09/83] update keyword in distributed ut check --- .github/scripts/ut_result_check.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 32dbed489..9bf611786 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -73,10 +73,10 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then fi fi if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then - grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + grep -E "^FAILED|have failures" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} xpu distributed" + echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" cat "./${ut_suite}_test_failed.log" ((num_failed=num_failed_distributed)) From 60dbd6eb19a407058eb5f1e6c4972df7fed94fe1 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 13:37:10 +0000 Subject: [PATCH 10/83] update pytorch build --- .github/workflows/_linux_build.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index eda5de367..3ed1c3d4e 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -72,22 +72,20 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python 
../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi - name: Build Pytorch XPU run: | From af0bca95baf745631876e918dfd4ab6b6823778c Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 06:01:16 -0700 Subject: [PATCH 11/83] enable fsdp cases based on local branch --- test/xpu/run_distributed_local.py | 63 +++++++++++++++++++++++++++++++ test/xpu/skip_list_dist_local.py | 57 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 test/xpu/run_distributed_local.py create mode 100644 test/xpu/skip_list_dist_local.py diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..8074b3292 --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,63 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict +from xpu_test_utils import launch_test + +res = 0 +fail_test = [] + +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log", "r") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(' ') + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split('/')[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ( "XL" in item or "S" in item ): + if len(affinity) == 0: + affinity = str(j-2) + else: + affinity = affinity + ',' + str(j-2) + gpu_dict[i] = affinity + + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..08f90c6b5 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,57 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + 
#"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + "test_delayed_optim_step_offload_true_no_shard_xpu", + "test_transformer_no_grad_mixed_precision_True_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + "test_fsdp_zero2_eval_with_prefetch", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1537 + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + "test_use_orig_params", + ), + # Performance check, skip + #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # "test_forward_overlap", + # "test_forward_overlap_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, +} From 6885a00cdf79029a72ff85938bdf330937ada7e4 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 19:46:24 -0700 Subject: [PATCH 12/83] add 2025.0 WA --- test/xpu/run_distributed_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 8074b3292..b6a9ef60c 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,9 @@ res = 0 fail_test 
= [] +os.environ["CCL_ATL_TRANSPORT"] = "ofi" +os.environ["CCL_SEND"] = "direct" +os.environ["CCL_RECV"] = "direct" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: From cd013d7882b28620cf0b81aace3f212bcbedaca9 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:27:44 -0700 Subject: [PATCH 13/83] Update distributed UT cases in DDP and PP Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 29 +++++++++- test/xpu/skip_list_dist_local.py | 91 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index b6a9ef60c..982f05409 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -2,15 +2,17 @@ import subprocess import sys -from skip_list_dist_local import skip_dict +from skip_list_dist_local import skip_dict, skip_dict_python from xpu_test_utils import launch_test res = 0 +res2 = 0 fail_test = [] os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" os.environ["CCL_RECV"] = "direct" +os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: @@ -48,6 +50,29 @@ print("xpu-smi topology failed") sys.exit(255) +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + +for key in skip_dict_python: + skip_list = skip_dict_python[key] + test_command = ["python", key] + fail = run(test_command) + if fail.returncode: + for line in fail.stderr.split("\n"): + if "FAIL: " in line: + is_error = True + for skip_case in skip_list: + if skip_case in line: + print("Skiped error: ", key + " " + skip_case) + is_error = False + if is_error: + res2 += fail.returncode + fail_test.append("".join(key + " " + line)) + # run pytest with skiplist for key in skip_dict: skip_list = skip_dict[key] @@ -61,6 +86,6 @@ exit_code = os.WEXITSTATUS(res) if exit_code == 0: - sys.exit(res) + sys.exit(res2) else: sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 08f90c6b5..d65b7aee6 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -54,4 +54,95 @@ "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_reduce_scatter_tensor_coalesced", + "test_reduce_scatter_tensor_single", + # https://github.com/intel/torch-xpu-ops/issues/1525 + # ValueError: trying to initialize the default process group twice! 
+ "test_inductor_all_gather_into_tensor_coalesced", + "test_inductor_all_gather_into_tensor_single", + "test_inductor_all_reduce_coalesced", + "test_inductor_all_reduce_non_contig_input", + "test_inductor_all_reduce_single", + "test_inductor_all_to_all_single", + "test_inductor_broadcast", + "test_inductor_inplace_op_on_view", + "test_inductor_reduce_scatter_tensor_coalesced", + "test_inductor_reduce_scatter_tensor_single", + "test_inductor_reuse_buffer_after_inplace_collective", + "test_ranks_and_tag", + "test_wait_tensor", + ), + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_gather_object_cpu", + "test_gather_object_xpu", + "test_gather_object_list_cpu", + "test_gather_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + "test_asymmetric_compilation", + "test_asymmetric_compilation_with_fx_cache", + # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. + "test_compiled_flex_attention_full_model_ddp", + "test_compiled_flex_attention_local_ddp", + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # https://github.com/intel/torch-xpu-ops/issues/1527 + "test_compiler_collectives_automatic_dynamic_scalar", + "test_compiler_collectives_automatic_dynamic_speculation_divergence", + "test_compiler_collectives_automatic_dynamic_tensor", + "test_compiler_collectives_dim_mismatch", + "test_compiler_collectives_graph_break_empty_graph_still_collective", + "test_compiler_collectives_missing_source", + "test_compiler_collectives_scalar_missing_source", + "test_compiler_collectives_type_mismatch", + "test_ddp_activation_checkpointing", + "test_ddp_baseline_aot_eager_multiprocess", + "test_fsdp_activation_checkpointing", + "test_fsdp_aot_eager", + "test_fsdp_inductor", + "test_fsdp_setattr", + "test_fsdp_unspecialized_forced_getattr_inline", + "test_fsdp_unspecialized_forced_getattr_no_inline", + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_get_pg_attr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_tracing_xpu", + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + ), + "../../../../test/distributed/test_multi_threaded_pg.py": ( + # oneccl not support multi-threaded well, so skip it first. 
+ "test_bwd_sees_fwd_pg", + ), + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, +} + +skip_dict_python = { + "distributed/test_c10d_ops_xccl.py": None, + "distributed/test_c10d_xccl.py": None, + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_stage.py": None, } From cd92f232de04270a17571df0989be7f32f679fcf Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:36:16 -0700 Subject: [PATCH 14/83] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 7 +++---- test/xpu/skip_list_dist_local.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 982f05409..a5f0c8098 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -11,7 +11,7 @@ os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" +os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") @@ -36,13 +36,12 @@ else: affinity = affinity + ',' + str(j-2) gpu_dict[i] = affinity - - + max_affinity = "" for key, value in gpu_dict.items(): if len(value) > len(max_affinity): max_affinity = value - + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index d65b7aee6..6ce62b8ca 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -96,7 +96,7 @@ # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
"test_compiled_flex_attention_full_model_ddp", "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ # https://github.com/intel/torch-xpu-ops/issues/1527 "test_compiler_collectives_automatic_dynamic_scalar", "test_compiler_collectives_automatic_dynamic_speculation_divergence", @@ -131,13 +131,13 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, } skip_dict_python = { From 413c2b09b48eba42bfc67ed70fb03973edef50a5 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 02:01:55 -0700 Subject: [PATCH 15/83] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 38 ++++++++++++++++--------------- test/xpu/skip_list_dist_local.py | 17 +++++++------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index a5f0c8098..d4db4785a 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -17,29 +17,29 @@ ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: gpu_dict = {} - with open("topology.log", "r") as file: + with open("topology.log") as file: lines = file.readlines() for line in lines: - if "CPU Affinity" in line: - continue - line = line.strip() - if line.startswith("GPU "): - items = line.split(' ') - items = [x for x in items if x] - gpu_id = items[1] - i = gpu_id.split('/')[0] - affinity = "" - for j, item in enumerate(items): - if "SYS" not in item and ( "XL" in item or "S" in item ): - if len(affinity) == 0: - affinity = str(j-2) - else: - affinity = affinity + ',' + str(j-2) - gpu_dict[i] = affinity + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity max_affinity = "" for key, value in gpu_dict.items(): - if len(value) > len(max_affinity): + if len(value) > len(max_affinity): max_affinity = value os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) @@ -49,6 +49,7 @@ print("xpu-smi topology failed") sys.exit(255) + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -56,6 +57,7 @@ def run(test_command): print(result.stderr) return result + for key in skip_dict_python: skip_list = skip_dict_python[key] test_command = ["python", key] diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 6ce62b8ca..0ac46961e 100644 --- a/test/xpu/skip_list_dist_local.py +++ 
b/test/xpu/skip_list_dist_local.py @@ -1,10 +1,10 @@ skip_dict = { "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, # https://github.com/intel/torch-xpu-ops/issues/1536 - #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( # "test_distributed_checkpoint_state_dict_type0_xpu", # "test_distributed_checkpoint_state_dict_type1_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, @@ -37,11 +37,11 @@ "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( "test_use_orig_params", ), - # Performance check, skip - #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( # "test_forward_overlap", # "test_forward_overlap_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, @@ -58,7 +58,7 @@ "../../../../test/distributed/test_c10d_common.py": None, "../../../../test/distributed/test_c10d_functional_native.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 - #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path "test_reduce_scatter_tensor_coalesced", "test_reduce_scatter_tensor_single", # https://github.com/intel/torch-xpu-ops/issues/1525 @@ -123,7 +123,7 @@ # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) # https://github.com/intel/torch-xpu-ops/issues/1526 "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. @@ -131,7 +131,6 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, @@ -143,6 +142,6 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, } From ab68eeef12b5546c9d5ff7000b222442ce88ca3f Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:25:50 +0000 Subject: [PATCH 16/83] add distributed ut in CI --- .github/scripts/ut_result_check.sh | 10 +-- .github/workflows/_linux_build.yml | 6 +- .github/workflows/_linux_ut.yml | 140 +++++++++++++++++++++++++++++ .github/workflows/pull.yml | 25 ++++++ 4 files changed, 175 insertions(+), 6 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 3fb1a1997..32dbed489 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -72,14 +72,14 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED|have failures" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") +if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" echo -e "=========================================================================" - cat "./${ut_suite}_xpu_distributed_test_failed.log" - ((num_failed=num_failed_xpu_distributed)) + cat "./${ut_suite}_test_failed.log" + ((num_failed=num_failed_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" exit 1 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index b67be9f29..f17d02a0c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -65,7 +65,11 @@ jobs: source activate xpu_build cd ../ && rm -rf pytorch pip install requests - git clone https://github.com/pytorch/pytorch pytorch + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) # apply PRs for stock pytorch python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index e2e21bbfb..1edd00a7c 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -420,3 +420,143 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate 
xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3dd204e32..be9d35397 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -66,6 +66,31 @@ jobs: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} ut: op_regression,op_regression_dev1,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e Inductor-XPU-E2E-CI-Tests: name: preci-linux / e2e_test From c5ec1405e405404d2f3f991d8ffbc213f6f2da5a Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:52:17 +0000 Subject: [PATCH 17/83] update if condition --- .github/workflows/_linux_build.yml | 26 ++++++++++++++------------ .github/workflows/_linux_ut.yml | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index f17d02a0c..e31d1e27b 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -67,20 +67,22 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - fi - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi - name: Build Pytorch XPU run: | diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 1edd00a7c..94dacaf54 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} From edc9e1b5bcde0adf04d47a634ab413cbae41c05a Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 23:02:20 +0000 Subject: [PATCH 18/83] keep_torch_xpu_ops --- .github/workflows/pull.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index be9d35397..eec6b2893 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,6 +78,7 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 + keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From 6c9e99adf2288f6652d0ccc8b84749e353800b85 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sun, 6 Apr 2025 19:24:08 +0000 Subject: [PATCH 19/83] update keyword in distributed ut check --- .github/scripts/ut_result_check.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 32dbed489..9bf611786 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -73,10 +73,10 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then fi fi if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then - grep -E "ERROR" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + grep -E "^FAILED|have failures" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} xpu distributed" + echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" cat "./${ut_suite}_test_failed.log" ((num_failed=num_failed_distributed)) From bdfa8536c16191cede8c9fd5710e1b90a8e526cc Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 13:37:10 +0000 Subject: [PATCH 20/83] update pytorch build --- .github/workflows/_linux_build.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index e31d1e27b..f17d02a0c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -67,22 +67,20 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python 
../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi - name: Build Pytorch XPU run: | From 0e77f3030f4e03c4b2cbadf19e1d3cf7c523d744 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 14:55:26 +0000 Subject: [PATCH 21/83] update if condition --- .github/workflows/_linux_ut.yml | 2 +- .github/workflows/pull.yml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 94dacaf54..deddcc5db 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' && inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index eec6b2893..be9d35397 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,7 +78,6 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 - keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From 4076a1a940d148137f9f530c5efface6ba2365d4 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 18:12:34 +0000 Subject: [PATCH 22/83] resolve Artifact name conflict --- .github/workflows/_linux_build.yml | 4 ++-- .github/workflows/_linux_ut.yml | 15 +++++---------- .github/workflows/pull.yml | 4 ++-- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index f17d02a0c..ae6c2064c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -171,11 +171,11 @@ jobs: if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/torch*.whl - name: Upload Build Log if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/pytorch_*.log diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index deddcc5db..0e8265639 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -95,7 +95,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -339,7 +339,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -474,18 +474,13 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | source activate xpu_op_${ZE_AFFINITY_MASK} source .github/scripts/env.sh ${{ inputs.pytorch }} pip install mkl-static==2025.0.1 mkl-include==2025.0.1 - if [[ ${{ inputs.abi }} == '0' ]]; then - export _GLIBCXX_USE_CXX11_ABI=0 - else - export _GLIBCXX_USE_CXX11_ABI=1 - fi if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then cd ../pytorch export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} @@ -534,7 +529,7 @@ jobs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi - timeout 10000 python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: UT Test Results Check @@ -558,5 +553,5 @@ jobs: if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index be9d35397..0e9ee9f63 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -88,7 +88,7 @@ jobs: needs: preci-linux-build-distributed uses: ./.github/workflows/_linux_ut.yml with: - pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + pytorch: ${{ needs.preci-linux-build-distributed.outputs.torch_commit_id }} ut: pytorch_distributed runner: pvc_e2e @@ -137,7 +137,7 @@ jobs: if: ${{ inputs.pytorch }} != 'nightly_wheel' uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ needs.preci-linux-build.outputs.torch_commit_id }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | From 5596ac4436e9d6b1b0367915b3d52ea25c408b5b Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Mon, 7 Apr 2025 23:41:37 -0700 Subject: [PATCH 23/83] enabled test_sharder.py on xpu --- test/xpu/skip_list_dist_local.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0ac46961e..218746b71 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -137,6 +137,7 @@ "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/_shard/test_sharder.py": None, } skip_dict_python = { From 2ed797354aab68575dc8c4ee0f746c9eef9eadac Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Wed, 9 Apr 2025 00:18:27 -0700 Subject: [PATCH 24/83] Enabled UT for test/distributed/tensor Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0ac46961e..42cdebf19 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -136,7 +136,85 @@ "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( + # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' + # is not currently implemented for the XPU device + # https://github.com/intel/torch-xpu-ops/issues/1547 + "test_dtensor_seq_par_shard_dim_0", + "test_dtensor_seq_par_shard_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' 
+ # https://github.com/intel/torch-xpu-ops/issues/1548 + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", + # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' + # https://github.com/intel/torch-xpu-ops/issues/1549 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", + # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1550 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", + # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' + # is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1551 + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_2", + ), + "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( + # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! + # https://github.com/intel/torch-xpu-ops/issues/1555 + "test/distributed/tensor/parallel/test_tp_examples.py::DistTensorParallelExampleTest::test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_output__tok_embeddings", + "test_transformer_training_is_seq_parallel_False_float32", + "test_transformer_training_is_seq_parallel_True_float32", + # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. 
+ # https://github.com/intel/torch-xpu-ops/issues/1556 + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + ), "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, + "../../../../test/distributed/tensor/test_api.py": None, + "../../../../test/distributed/tensor/test_attention.py": None, + "../../../../test/distributed/tensor/test_common_rules.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": None, + "../../../../test/distributed/tensor/test_dtensor_compile.py": None, + "../../../../test/distributed/tensor/test_experimental_ops.py": None, + "../../../../test/distributed/tensor/test_init.py": None, + "../../../../test/distributed/tensor/test_math_ops.py": ( + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_mean", + "test_nll_loss_and_cross_entropy", + ), + "../../../../test/distributed/tensor/test_random_ops.py": None, + "../../../../test/distributed/tensor/test_redistribute.py": None, + "../../../../test/distributed/tensor/test_tensor_ops.py": None, + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, } skip_dict_python = { @@ -144,4 +222,5 @@ "distributed/test_c10d_xccl.py": None, "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. "../../../../test/distributed/pipelining/test_stage.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, } From 5bab858cbde56b7319c43690157aee43d06917f3 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 9 Apr 2025 23:57:58 -0700 Subject: [PATCH 25/83] add FSDP2 cases, improved check-ut.py for summary, do ZE_AFFINITY_MASK configuration before import torch --- .github/scripts/check-ut.py | 5 ++++- test/xpu/run_distributed_local.py | 3 ++- test/xpu/skip_list_dist_local.py | 17 +++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 8cd490bc8..9d9e4edfd 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -30,7 +30,8 @@ def get_result(case): def get_message(case): if not case.result: return "" - return f"{case.result[0].message.splitlines()[0]}" + #return f" for line in {case.result[0].message.splitlines()}" + return [item for item in case.result[0].message.splitlines() if "Error:" in item] def print_md_row(row, print_header): if print_header: @@ -75,6 +76,8 @@ def print_suite(suite): category = 'op_extended' elif 'op_ut' in ut: category = 'op_ut' + else: + category = "default" row = { 'Category': category, 'UT': ut, diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index d4db4785a..1c2435e15 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -3,7 +3,6 @@ import sys from skip_list_dist_local import skip_dict, skip_dict_python -from xpu_test_utils import launch_test res = 0 res2 = 0 @@ -50,6 +49,8 @@ sys.exit(255) +from xpu_test_utils import launch_test + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 5629046d9..a41c91f18 100644 --- a/test/xpu/skip_list_dist_local.py 
+++ b/test/xpu/skip_list_dist_local.py @@ -216,6 +216,23 @@ "../../../../test/distributed/tensor/test_tensor_ops.py": None, "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, "../../../../test/distributed/_shard/test_sharder.py": None, + # FSDP2 + "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": None, } skip_dict_python = { From f1b824d7764ddf88989f1960519a84dc449fbb56 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 10 Apr 2025 01:27:23 -0700 Subject: [PATCH 26/83] Skip test_schedule_multiproc.py for hang error Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 5629046d9..b2984fb17 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -221,7 +221,7 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + # "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, } From f696faad63d48e4a2e65a15340c998aedc9d529d Mon Sep 17 00:00:00 2001 From: Cheng Penghui Date: Mon, 14 Apr 2025 23:14:30 -0700 Subject: [PATCH 27/83] refine error log for test files without pytest Signed-off-by: Cheng Penghui --- test/xpu/run_distributed_local.py | 52 ++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index d4db4785a..96761cd82 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,7 @@ res = 0 res2 = 0 fail_test = [] +error_log = "" os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" @@ -59,20 +60,49 @@ def run(test_command): for key in skip_dict_python: - skip_list = skip_dict_python[key] + skip_list = skip_dict_python[key] if skip_dict_python[key] else [] test_command = ["python", key] fail = run(test_command) if fail.returncode: - for line in fail.stderr.split("\n"): - if "FAIL: " in line: - is_error = True - for skip_case in skip_list: - if skip_case in line: - print("Skiped error: ", key + " " + skip_case) - is_error = False - if is_error: - res2 += fail.returncode - fail_test.append("".join(key + " " + line)) + num_skipped = 0 + num_err = 0 + for i, err in enumerate(fail.stderr.split("FAIL: ")): + if i == 0 and len(err) > 0: + error_log += err + continue + is_skipped = False + for skip_case in skip_list: + if skip_case in err: + print("Skipped error: ", key + " " + skip_case) + num_skipped += 1 + is_skipped = True + break + if not is_skipped: + num_err += 1 + res2 += fail.returncode + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + else: + error_log += (line + "\n") + else: + error_log += ("FAIL: " + err) + else: + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + + if num_err > 0: + fail_test.append(key) + renamed_key = key.replace("../../../../", "").replace("/", "_") + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(error_log) # run pytest with skiplist for key in skip_dict: From 00326ac761623a735718105609c6e0cb05686a7c Mon Sep 17 00:00:00 2001 From: Cheng Penghui Date: Tue, 15 Apr 2025 01:50:09 -0700 Subject: [PATCH 28/83] Fixed error for create log file without pytest Signed-off-by: Cheng Penghui --- test/xpu/run_distributed_local.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 46a0be814..46905cef1 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -64,9 +64,9 @@ def run(test_command): skip_list = skip_dict_python[key] if skip_dict_python[key] else [] test_command = ["python", key] fail = run(test_command) + num_skipped = 0 + num_err = 0 if fail.returncode: - num_skipped = 0 - num_err = 0 for i, err in enumerate(fail.stderr.split("FAIL: ")): if i == 0 and len(err) > 
0:
error_log += err
@@ -99,11 +99,16 @@ def run(test_command):
num_errs = line.split("=")[1].split(")")[0].strip()
error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n")
+ renamed_key = key.replace("../../../../", "").replace("/", "_")
if num_err > 0:
fail_test.append(key)
- renamed_key = key.replace("../../../../", "").replace("/", "_")
with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f:
f.write(error_log)
+ else:
+ import pdb;pdb.set_trace()
+ with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f:
+ f.write(fail.stdout)
+ f.write(fail.stderr)
# run pytest with skiplist
for key in skip_dict:
From 59c609e66945c3b4d2dae80a3f909256451be4e3 Mon Sep 17 00:00:00 2001
From: "Cheng, Penghui"
Date: Tue, 15 Apr 2025 23:07:01 -0700
Subject: [PATCH 29/83] Skipped cases that raised issues
Signed-off-by: Cheng, Penghui
---
test/xpu/run_distributed_local.py | 3 -
test/xpu/skip_list_dist_local.py | 271 +++++++++++++++++++++++++++---
2 files changed, 246 insertions(+), 28 deletions(-)
diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py
index 46a0be814..63a588416 100644
--- a/test/xpu/run_distributed_local.py
+++ b/test/xpu/run_distributed_local.py
@@ -9,9 +9,6 @@
fail_test = []
error_log = ""
-os.environ["CCL_ATL_TRANSPORT"] = "ofi"
-os.environ["CCL_SEND"] = "direct"
-os.environ["CCL_RECV"] = "direct"
os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining"
# Get the xelink group card affinity
ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py
index e6a2a34f3..9ec4c59e0 100644
--- a/test/xpu/skip_list_dist_local.py
+++ b/test/xpu/skip_list_dist_local.py
@@ -7,19 +7,120 @@
# ),
"../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
"../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None,
- "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None,
- "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None,
+ "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": (
+ # https://github.com/intel/torch-xpu-ops/issues/1504
+ "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False",
+ "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False",
+ "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False",
+ "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False",
+ "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False",
+ "test_checkpoint_submodule_use_reentrant_False_xpu",
+ ),
+ "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": (
+ # https://github.com/intel/torch-xpu-ops/issues/1504
+ "test_ddp_parity_xpu",
+ ),
"../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
"../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": (
+ # https://github.com/intel/torch-xpu-ops/issues/1504
+ "test_bf16_hook_has_wrapping_False_sharding_strategy0",
+ "test_bf16_hook_has_wrapping_False_sharding_strategy1",
+ "test_bf16_hook_has_wrapping_False_sharding_strategy2",
+ "test_bf16_hook_has_wrapping_True_sharding_strategy0",
+ "test_bf16_hook_has_wrapping_True_sharding_strategy1",
+ "test_bf16_hook_has_wrapping_True_sharding_strategy2",
+
"test_fp16_hook_has_wrapping_False_sharding_strategy1", + "test_fp16_hook_has_wrapping_False_sharding_strategy2", + "test_fp16_hook_has_wrapping_True_sharding_strategy0", + "test_fp16_hook_has_wrapping_True_sharding_strategy1", + "test_fp16_hook_has_wrapping_True_sharding_strategy2", + ), "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 "test_delayed_optim_step_offload_true_no_shard_xpu", "test_transformer_no_grad_mixed_precision_True_xpu", + "test_delayed_optim_step_offload_false_no_shard_xpu", + "test_delayed_optim_step_offload_false_none_xpu", + "test_delayed_optim_step_offload_false_shard_grad_op_xpu", + "test_delayed_optim_step_offload_true_none_xpu", + "test_delayed_optim_step_offload_true_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_false_no_shard_xpu", + "test_delayed_reduce_scatter_offload_false_none_xpu", + "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_true_none_xpu", + "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_offload_false_no_shard_xpu", + "test_mixture_of_experts_offload_false_none_xpu", + "test_mixture_of_experts_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_offload_true_none_xpu", + "test_mixture_of_experts_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_false_no_shard_xpu", + "test_nested_always_wrap_model_offload_false_none_xpu", + "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_true_none_xpu", + "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_false_no_shard_xpu", + "test_nested_wrapped_model_offload_false_none_xpu", + "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_true_none_xpu", + "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", + "test_transformer_offload_false_none_xpu", + "test_transformer_offload_false_shard_grad_op_xpu", + "test_transformer_offload_true_none_xpu", + "test_transformer_offload_true_shard_grad_op_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + " test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", ), - "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, - "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_hooks_multi_traversal_xpu", + "test_parity_with_ddp_xpu", + "test_parity_with_non_frozen_fsdp_xpu", + ), "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, - "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": ( + # 
https://github.com/intel/torch-xpu-ops/issues/1504 + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + 
"test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True ", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + ), "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, @@ -28,28 +129,89 @@ "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 "test_fsdp_zero2_eval_with_prefetch", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_optimizer_overlap", ), "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, - "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_multi_forward_cpu", + ), "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, # https://github.com/intel/torch-xpu-ops/issues/1537 "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_flatten_sharded_optim_state_dict_nested", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + 
"test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_names", + "test_scatter_full_optim_state_dict_nested_halve_world_size", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_halve_world_size", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + 
"test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", "test_use_orig_params", ), # Performance check, skip # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # # https://github.com/intel/torch-xpu-ops/issues/1504 # "test_forward_overlap", # "test_forward_overlap_xpu", # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, - "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, - "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_shard_grad_op_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_shard_grad_op_none_none", + ), + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_state_dict_save_load_flow_state_dict_type_local_state_dict", + "test_state_dict_save_load_flow_state_dict_type_sharded_state_dict", + "test_state_dict_save_load_flow_state_dict_type_state_dict", + ), "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, - "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_diff_hyperparams_sharding_strategy_str_full_shard", + "test_diff_hyperparams_sharding_strategy_str_no_shard", + "test_diff_hyperparams_sharding_strategy_str_shard_grad_op", + "test_no_sync_correctness", + ), "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, @@ -127,11 +289,20 @@ ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. 
+ # https://github.com/intel/torch-xpu-ops/issues/1509 "test_bwd_sees_fwd_pg", ), "../../../../test/distributed/test_store.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_backward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_stage_backward_weight_multiple_iters_xpu", + "test_stage_backward_weight_xpu", + "test_stage_backward_xpu", + ), + "../../../../test/distributed/pipelining/test_microbatch.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_chunk_spec_xpu", + ), "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, @@ -184,7 +355,7 @@ "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! # https://github.com/intel/torch-xpu-ops/issues/1555 - "test/distributed/tensor/parallel/test_tp_examples.py::DistTensorParallelExampleTest::test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_all", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", @@ -201,9 +372,29 @@ "../../../../test/distributed/tensor/test_api.py": None, "../../../../test/distributed/tensor/test_attention.py": None, "../../../../test/distributed/tensor/test_common_rules.py": None, - "../../../../test/distributed/tensor/test_dtensor.py": None, - "../../../../test/distributed/tensor/test_dtensor_compile.py": None, - "../../../../test/distributed/tensor/test_experimental_ops.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": ( + # Passed with updated test code for world_size 8 + "test_auto_implicit_replication", + "test_default_value_sub_mesh", + "test_device_mesh_nd", + "test_dtensor_2d_mesh", + "test_dtensor_api_device_mesh_context_manager", + "test_dtensor_device_mesh_device_conversion", + "test_dtensor_spec_local_shard_offset", + "test_from_local_sub_mesh", + "test_implicit_replication", + "test_metadata_consistency_check", + "test_redistribute_sub_mesh", + "test_split_tensor_1D", + ), + "../../../../test/distributed/tensor/test_dtensor_compile.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_2d_fsdp_tp_compile", + ), + "../../../../test/distributed/tensor/test_experimental_ops.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_bernoulli", + ), "../../../../test/distributed/tensor/test_init.py": None, "../../../../test/distributed/tensor/test_math_ops.py": ( # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path @@ -211,28 +402,58 @@ "test_mean", "test_nll_loss_and_cross_entropy", ), - "../../../../test/distributed/tensor/test_random_ops.py": None, - "../../../../test/distributed/tensor/test_redistribute.py": None, + "../../../../test/distributed/tensor/test_random_ops.py": ( + # Need to update world 
size + "test_hsdp_tp_model_meta_init", + ), + "../../../../test/distributed/tensor/test_redistribute.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_redistribute_shard_dim_multi_dim_mesh", + ), "../../../../test/distributed/tensor/test_tensor_ops.py": None, "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, "../../../../test/distributed/_shard/test_sharder.py": None, # FSDP2 "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_clip_grad_norm_2d", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1571 + "test_set_reduce_scatter_divide_factor", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_gradient_scaler", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_fully_shard_training_memory", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": ( + # Performance test, should skip + "test_fully_shard_training_overlap", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1572 + "test_dp_state_dict_cpu_offload", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_post_optim_event", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_train_parity_multi_group_unshard_async_op", + "test_train_parity_with_activation_checkpointing", + ), } skip_dict_python = { From de00feb16f2d68bf2f1d752bb3d5458eca4cf223 Mon Sep 17 00:00:00 2001 From: Zhong Ruijie <109201212+RUIJIEZHONG66166@users.noreply.github.com> Date: Wed, 16 Apr 2025 16:49:41 +0800 Subject: [PATCH 30/83] Update ut summary --- .github/scripts/check-ut.py | 302 
+++++++++++++++++++++++--------- .github/workflows/_linux_ut.yml | 12 ++ 2 files changed, 235 insertions(+), 79 deletions(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 9d9e4edfd..7e7c6ecd4 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -1,22 +1,47 @@ import argparse import sys import os +import re from junitparser import JUnitXml, Error, Failure, Skipped -parser = argparse.ArgumentParser() -parser.add_argument('junitxml', nargs='+') +parser = argparse.ArgumentParser(description='Test results analyzer') +parser.add_argument('input_files', nargs='+', help='JUnit XML files or log files') args = parser.parse_args() failures = [] -suites = [] +summaries = [] + +error_types = [ + "RuntimeError", + "ValueError", + "TypeError", + "AttributeError", + "KeyError", + "IndexError", + "ImportError", + "AssertionError", + "Exception", + "OSError", + "Failed", + "TimeoutError", + "asyncio.TimeoutError", + "FileNotFoundError", + "PermissionError", + "NotImplementedError", +] def get_classname(case): - return ' '.join(case.classname.split()) + return ' '.join(case.classname.split()) if hasattr(case, 'classname') else case.get('classname', '') def get_name(case): + if isinstance(case, dict): + return case.get('name', '') return ' '.join(case.name.split()) def get_result(case): + if isinstance(case, dict): + return case.get('status', 'failed') + result = "passed" if case.result: if isinstance(case.result[0], Error): @@ -28,91 +53,210 @@ def get_result(case): return result def get_message(case): + if isinstance(case, dict): + return case.get('error', '') + if not case.result: return "" - #return f" for line in {case.result[0].message.splitlines()}" - return [item for item in case.result[0].message.splitlines() if "Error:" in item] -def print_md_row(row, print_header): + full_text = case.result[0].text if hasattr(case.result[0], 'text') else case.result[0].message + if not full_text: + return "" + + error_messages = [] + capture_next_lines = False + indent_level = 0 + + for line in full_text.splitlines(): + stripped_line = line.strip() + if not stripped_line: + continue + + for error_type in error_types: + if stripped_line.startswith(error_type + ": "): + error_msg = stripped_line[len(error_type)+2:] + error_messages.append(f"{error_type}: {error_msg}") + capture_next_lines = True + indent_level = 0 + break + elif f"{error_type}:" in stripped_line and "Traceback" not in stripped_line: + error_msg = stripped_line.split(f'{error_type}:')[-1].strip() + error_messages.append(f"{error_type}: {error_msg}") + capture_next_lines = True + indent_level = 0 + break + + return " ; ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" + + +def print_md_row(row, print_header=False): if print_header: - header = " | ".join([f"{key}" for key, _ in row.items()]) + header = " | ".join([f"{key}" for key in row.keys()]) print(f"| {header} |") - header = " | ".join(["-"*len(key) for key, _ in row.items()]) + header = " | ".join(["---"] * len(row)) print(f"| {header} |") - row = " | ".join([f"{value}" for _, value in row.items()]) - print(f"| {row} |") + row_values = " | ".join([f"{value}" for value in row.values()]) + print(f"| {row_values} |") + +def print_failures(): + if not failures: + return -def print_cases(cases): + print("### Test Failures") print_header = True - for case in cases: - classname = get_classname(case) - name = get_name(case) - result = get_result(case) - message = get_message(case) - row = { - 'Class name': 
classname, - 'Test name': name, - 'Status': result, - 'Message': message, - } - print_md_row(row, print_header) + for case in failures: + print_md_row({ + 'Class name': get_classname(case), + 'Test name': get_name(case), + 'Status': get_result(case), + 'Message': get_message(case), + 'Source': case['source'] if isinstance(case, dict) else 'XML' + }, print_header) print_header = False -def print_suite(suite): - print_header = True - for suite in suites: - ut = args.junitxml[0] - del(args.junitxml[0]) - ut = os.path.basename(ut).split('.')[0] - tests = suite.tests - skipped = suite.skipped - failures = suite.failures - errors = suite.errors - if ut == 'op_regression': - category = 'op_regression' - elif ut == 'op_regression_dev1': - category = 'op_regression_dev1' - elif ut == 'op_extended': - category = 'op_extended' - elif 'op_ut' in ut: - category = 'op_ut' +def parse_log_file(log_file): + with open(log_file, encoding='utf-8') as f: + content = f.read() + + ut_name = os.path.splitext(os.path.basename(log_file))[0] + summary = { + 'Category': determine_category(ut_name), + 'UT': ut_name, + 'Test cases': 0, + 'Passed': 0, + 'Skipped': 0, + 'Failures': 0, + 'Errors': 0, + 'Source': 'Log' + } + + # Extract test counts + test_run_match = re.search(r"Ran (\d+) tests in [\d.]+s", content) + if test_run_match: + summary['Test cases'] = int(test_run_match.group(1)) + + # Extract skipped case number + skipped_match = re.search(r"skipped[ =](\d+)", content, re.IGNORECASE) + if skipped_match: + summary['Skipped'] = int(skipped_match.group(1)) + else: + skipped_match = re.search(r"skipped (\d+) cases?", content, re.IGNORECASE) + if skipped_match: + summary['Skipped'] = int(skipped_match.group(1)) + + # Extract failures + failure_blocks = re.findall(r"(FAIL:.*?)(?:\n\n|\n=+\n|\Z)", content, re.DOTALL) + exist_test_names = set() + failures_number = 0 + + for block in failure_blocks: + case_match = re.match(r"FAIL: (\w+) \(__mp_main__\.(\w+)\)", block) + if not case_match: + continue + + test_name = case_match.group(1) + if test_name in exist_test_names: + continue + exist_test_names.add(test_name) + + error_msg = [] + error_pattern = r"(" + "|".join(error_types) + r"):.*?(?=\n\S|\n\n|\n=+\n|\Z)" + error_matches = re.finditer(error_pattern, block, re.DOTALL) + if not error_matches and "Traceback" in block: + error_msg.append("Unknown error (see traceback)") else: - category = "default" - row = { - 'Category': category, - 'UT': ut, - 'Test cases': tests, - 'Passed': tests-skipped-failures-errors, - 'Skipped': skipped, - 'Failures': failures, - 'Errors': errors, - } - print_md_row(row, print_header) + for match in error_matches: + error_msg.append(match.group(0).strip()) + + failures.append({ + 'classname': ut_name, + 'name': f"{case_match.group(2)}:{test_name}", + 'error': " ".join(error_msg), + 'status': 'failed', + 'source': 'Log' + }) + failures_number += 1 + + if failures_number > summary['Failures']: + summary['Failures'] = failures_number + summary['Passed'] = summary['Test cases'] - summary['Failures'] - summary['Skipped'] + + return summary + +def determine_category(ut): + if ut == 'op_regression': + return 'op_regression' + elif ut == 'op_regression_dev1': + return 'op_regression_dev1' + elif ut == 'op_extended': + return 'op_extended' + elif 'op_ut' in ut: + return 'op_ut' + else: + return 'unknown' + +def process_log_file(log_file): + try: + summary = parse_log_file(log_file) + summaries.append(summary) + except Exception as e: + print(f"Error processing {log_file}: {e}", file=sys.stderr) + 
+def process_xml_file(xml_file): + try: + xml = JUnitXml.fromfile(xml_file) + ut = os.path.basename(xml_file).split('.')[0] + category = determine_category(ut) + + for suite in xml: + suite_summary = { + 'Category': category, + 'UT': ut, + 'Test cases': suite.tests, + 'Passed': suite.tests - suite.skipped - suite.failures - suite.errors, + 'Skipped': suite.skipped, + 'Failures': suite.failures, + 'Errors': suite.errors, + 'Source': 'XML' + } + summaries.append(suite_summary) + + for case in suite: + if get_result(case) not in ["passed", "skipped"]: + failures.append(case) + except Exception as e: + print(f"Error processing {xml_file}: {e}", file=sys.stderr) + +def print_summary(): + print("### Results Summary") + print_header = True + + for summary in summaries: + print_md_row({ + 'Category': summary['Category'], + 'UT': summary['UT'], + 'Test cases': summary['Test cases'], + 'Passed': summary['Passed'], + 'Skipped': summary['Skipped'], + 'Failures': summary['Failures'], + 'Errors': summary['Errors'], + 'Source': summary['Source'] + }, print_header) + print_header = False -xmls = [ JUnitXml.fromfile(f) for f in args.junitxml ] -for idx, xml in enumerate(xmls): - for suite in xml: - suites.append(suite) - for case in suite: - classname = get_classname(case) - name = get_name(case) - result = get_result(case) - if result not in ["passed", "skipped"]: - failures.append(case) - -printed = False -def print_break(needed): - if needed: - print("") - -if failures: - print_break(printed) - print("### Failures") - print_cases(failures) - printed = True - -print("### Results Summary") -print_suite(suites) - -sys.exit(0) +def main(): + for input_file in args.input_files: + if input_file.endswith('.log'): + process_log_file(input_file) + elif input_file.endswith('.xml'): + process_xml_file(input_file) + else: + print(f"Skipping unknown file type: {input_file}", file=sys.stderr) + + print_failures() + print_summary() + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index f1410abec..8a4cc0b45 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -182,6 +182,18 @@ jobs: cd ../pytorch/third_party/torch-xpu-ops/test/xpu timeout 10000 python run_test_with_skip.py 2>${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test_error.log | tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test.log cp *.xml ${{ github.workspace }}/ut_log + find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c ' + dir_path=$(dirname "$1"); + case "$dir_path" in + *"op_ut_with_skip_quantization/core"*) + dir_name="op_ut_with_skip_quantization_core";; + *) + dir_name=$(basename "$dir_path");; + esac; + mv "$1" "$dir_path/${dir_name}_$(basename "$1")" + ' _ {} \; + cp op_ut_with_skip_nn/*.xml ${{ github.workspace }}/ut_log + cp op_ut_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log # Cases run with a on-demand white list, since some suites are too # slow to go through all operators on CPU. So add cases on-demand # when XPU implementatoin is done. 
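Note on the check-ut.py rework above: the script now accepts both JUnit XML files and plain unittest log files, derives one summary row per input, and prints GitHub-flavored markdown tables for the step summary (a later patch in this series invokes it as `python .github/scripts/check-ut.py .../ut_log/pytorch_distributed_summary/* >> $GITHUB_STEP_SUMMARY`). The snippet below is a minimal standalone sketch of the log-parsing path only, not the script itself; the sample log text and the "pytorch_distributed" UT name are invented for illustration.

import re

SAMPLE_LOG = """\
test_allreduce (__mp_main__.TestXCCL) ... ok
test_broadcast (__mp_main__.TestXCCL) ... skipped 'not supported on XPU'
FAIL: test_scatter (__mp_main__.TestXCCL)
RuntimeError: oneCCL: average operation is not supported for the scheduler path
--------------------
Ran 3 tests in 12.345s

FAILED (failures=1, skipped=1)
"""

def summarize(log_text, ut_name="pytorch_distributed"):
    # Roughly what parse_log_file() does: pull the totals out of the unittest
    # footer (the real script additionally walks FAIL: blocks for error messages).
    ran = re.search(r"Ran (\d+) tests in [\d.]+s", log_text)
    skipped = re.search(r"skipped[ =](\d+)", log_text, re.IGNORECASE)
    failed = re.search(r"failures=(\d+)", log_text)
    tests = int(ran.group(1)) if ran else 0
    n_skip = int(skipped.group(1)) if skipped else 0
    n_fail = int(failed.group(1)) if failed else 0
    return {
        "UT": ut_name,
        "Test cases": tests,
        "Passed": tests - n_skip - n_fail,
        "Skipped": n_skip,
        "Failures": n_fail,
    }

def print_md_row(row, print_header=False):
    # Mirrors the markdown-table helper: a header row plus one row per summary.
    if print_header:
        print("| " + " | ".join(row.keys()) + " |")
        print("| " + " | ".join(["---"] * len(row)) + " |")
    print("| " + " | ".join(str(v) for v in row.values()) + " |")

if __name__ == "__main__":
    print_md_row(summarize(SAMPLE_LOG), print_header=True)
    # | UT | Test cases | Passed | Skipped | Failures |
    # | --- | --- | --- | --- | --- |
    # | pytorch_distributed | 3 | 1 | 1 | 1 |
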
From 4c3651eae2af3fcb3d0d1c2039e4210110a01de7 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:03:10 +0000 Subject: [PATCH 31/83] update ut --- .github/workflows/_linux_ut.yml | 140 -------------------------------- 1 file changed, 140 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 8a4cc0b45..7ac9615dd 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -564,143 +564,3 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed path: ${{ github.workspace }}/ut_log - - pytorch_distributed_test: - runs-on: ${{ inputs.runner }} - if: contains(inputs.ut, 'pytorch_distributed') - timeout-minutes: 900 - env: - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Stock Pytorch - run: | - pwd - which conda && conda clean -ay - conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch - pip install requests - git clone https://github.com/daisyden/pytorch.git pytorch - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi - fi - - name: Triton Installation - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../pytorch - TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" - if [ -z ${{ inputs.triton }} ]; then - TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" - else - TRITON_COMMIT_ID="${{ inputs.triton }}" - fi - echo ${TRITON_REPO}@${TRITON_COMMIT_ID} - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" - fi - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 - with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} - path: ${{ github.workspace }} - - name: Install Pytorch XPU - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - source .github/scripts/env.sh ${{ inputs.pytorch }} - pip install mkl-static==2025.0.1 mkl-include==2025.0.1 - if [[ ${{ inputs.abi }} == '0' ]]; then - export _GLIBCXX_USE_CXX11_ABI=0 - else - export _GLIBCXX_USE_CXX11_ABI=1 - fi - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd ../pytorch - export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - pip install -r requirements.txt - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. 
- else - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ../pytorch - git reset --hard && git checkout ${TORCH_COMMIT_ID} - TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cd ${{ github.workspace }} - sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - - name: UT Test Results Check - shell: bash - run: | - function contains() { - contains_status="echo 'Start $2 ...'" - { - [[ $1 =~ (^|,)$2($|,) ]] - } || { - echo "[Warning] $2 is not suppotted type! Skipped!" - contains_status="continue" - } - } - set -xe - echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - cd ${{ github.workspace }}/ut_log/pytorch_distributed - cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ - bash ut_result_check.sh 'pytorch_distributed' - - name: Upload Inductor XPU UT Log - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed - path: ${{ github.workspace }}/ut_log From 6f635a7c64e22fc28dadef515841acdf42ce04f7 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Wed, 16 Apr 2025 02:30:33 -0700 Subject: [PATCH 32/83] add distributed ut summary --- .github/workflows/_linux_ut.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 7ac9615dd..183a95680 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -540,7 +540,14 @@ jobs: fi python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cd ${{ github.workspace }} + mkdir -p ut_log/pytorch_distributed_summary + cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: Distributed UT Test Results Summary + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + pip install junitparser + python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/pytorch_distributed_summary/* >> $GITHUB_STEP_SUMMARY || true - name: UT Test Results Check shell: bash run: | From f0e1128d1ceccb8f49ac4285854ee81a52d1dc40 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Wed, 16 Apr 2025 02:32:39 -0700 Subject: [PATCH 33/83] align the path --- .github/workflows/_linux_ut.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 183a95680..937425699 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -532,6 +532,7 @@ jobs: sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk sudo echo "0"|sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/pytorch_distributed + mkdir -p ut_log/pytorch_distributed_summary cd ../pytorch/third_party/torch-xpu-ops/test/xpu XCCL_EANBLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE}}" == 'False' ]]; then @@ -539,9 +540,8 @@ jobs: exit 1 fi python run_distributed_local.py 2>${{ github.workspace 
}}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cd ${{ github.workspace }} - mkdir -p ut_log/pytorch_distributed_summary cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary + cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: Distributed UT Test Results Summary run: | From e9b1ba9b060d85920f0f185317f0bce6a4d0a3cb Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:19:12 +0000 Subject: [PATCH 34/83] fix lint issue --- test/xpu/run_distributed_local.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index cc40373bc..9cd07cc55 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -83,18 +83,27 @@ def run(test_command): for line in err.split("\n"): if line.startswith("FAILED (failures="): num_errs = line.split("=")[1].split(")")[0].strip() - error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + error_log += ( + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n") else: - error_log += (line + "\n") + error_log += line + "\n" else: - error_log += ("FAIL: " + err) + error_log += "FAIL: " + err else: if i == len(fail.stderr.split("FAIL: ")) - 1: error_log += "FAIL: " for line in err.split("\n"): if line.startswith("FAILED (failures="): num_errs = line.split("=")[1].split(")")[0].strip() - error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + error_log += ( + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n" + ) renamed_key = key.replace("../../../../", "").replace("/", "_") if num_err > 0: @@ -102,7 +111,8 @@ def run(test_command): with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(error_log) else: - import pdb;pdb.set_trace() + import pdb + pdb.set_trace() with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(fail.stdout) f.write(fail.stderr) From 14773da97d3d5e8a4231a2a027e32a7007892abb Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:23:49 +0000 Subject: [PATCH 35/83] fix lint issue --- test/xpu/run_distributed_local.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 9cd07cc55..014a81235 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -49,6 +49,7 @@ from xpu_test_utils import launch_test + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -84,10 +85,11 @@ def run(test_command): if line.startswith("FAILED (failures="): num_errs = line.split("=")[1].split(")")[0].strip() error_log += ( - "FAILED (failures=" - + str(int(num_errs) - num_skipped) - + f" skipped {num_skipped} cases" - + ")\n") + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n" + ) else: error_log += line + "\n" else: @@ -99,9 +101,9 @@ def run(test_command): if line.startswith("FAILED (failures="): num_errs = line.split("=")[1].split(")")[0].strip() error_log += ( - "FAILED (failures=" - + str(int(num_errs) - num_skipped) - + f" skipped {num_skipped} 
cases" + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + ")\n" ) @@ -112,6 +114,7 @@ def run(test_command): f.write(error_log) else: import pdb + pdb.set_trace() with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(fail.stdout) From 5d9d94bea923f6c6d45f21c3ee5feac523e166b3 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:29:15 +0000 Subject: [PATCH 36/83] fix lint issue --- test/xpu/skip_list_dist_local.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 9ec4c59e0..d7ef3b461 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -17,8 +17,8 @@ "test_checkpoint_submodule_use_reentrant_False_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_ddp_parity_xpu", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_ddp_parity_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": ( From 5197d87713e5d85bc81fc0866a67a14728f3390f Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:50:43 +0000 Subject: [PATCH 37/83] update --- .github/workflows/_linux_ut.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 937425699..44e9cf90c 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -313,7 +313,7 @@ jobs: rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests git clone https://github.com/pytorch/pytorch pytorch if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then @@ -532,7 +532,6 @@ jobs: sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk sudo echo "0"|sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/pytorch_distributed - mkdir -p ut_log/pytorch_distributed_summary cd ../pytorch/third_party/torch-xpu-ops/test/xpu XCCL_EANBLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE}}" == 'False' ]]; then @@ -540,8 +539,9 @@ jobs: exit 1 fi python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary cd ${{ github.workspace }} + mkdir -p ut_log/pytorch_distributed_summary + cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: Distributed UT Test Results Summary run: | From be64dbe1df07dc87502fdd255024a62c781f717b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 18:01:42 +0000 Subject: [PATCH 38/83] update --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 44e9cf90c..d79848ed1 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -60,7 +60,7 @@ jobs: rm -rf $(dirname 
${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests git clone https://github.com/pytorch/pytorch pytorch if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then From 0e445777e5f6c7d8e3cd33bc2eb09c4352390512 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 18:23:09 +0000 Subject: [PATCH 39/83] update --- .github/workflows/_linux_build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index ae6c2064c..d1b9b98b0 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -63,7 +63,7 @@ jobs: rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_build conda create -n xpu_build python=${{ inputs.python }} cmake=3.28 ninja -y source activate xpu_build - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch From d0a0609f33be30c748e9b5846f14e07dbccc9f5b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Thu, 17 Apr 2025 01:30:28 +0000 Subject: [PATCH 40/83] comment pdb --- test/xpu/run_distributed_local.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 014a81235..9bdffb00c 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -113,9 +113,8 @@ def run(test_command): with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(error_log) else: - import pdb - - pdb.set_trace() + # import pdb + # pdb.set_trace() with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(fail.stdout) f.write(fail.stderr) From 65d1953354a7854bc62cb30baf0e5346858d2b1b Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Wed, 16 Apr 2025 18:18:08 -0700 Subject: [PATCH 41/83] align the path --- .github/workflows/_linux_ut.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index d79848ed1..5b81ce89d 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -532,16 +532,16 @@ jobs: sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk sudo echo "0"|sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/pytorch_distributed + mkdir -p ut_log/pytorch_distributed_summary cd ../pytorch/third_party/torch-xpu-ops/test/xpu XCCL_EANBLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE}}" == 'False' ]]; then echo -e "[ERROR] XCCL is not enabled" exit 1 fi - python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cd ${{ github.workspace }} - mkdir -p ut_log/pytorch_distributed_summary + python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary + cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: 
Distributed UT Test Results Summary run: | From 415abe78948428338bdf88891cb3b6b6bba7ec25 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Fri, 18 Apr 2025 02:17:54 -0700 Subject: [PATCH 42/83] Skipped error cases Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 1 - test/xpu/skip_list_dist_local.py | 94 ++++++++++++++++++++++++++++--- test/xpu/xpu_test_utils.py | 7 ++- 3 files changed, 89 insertions(+), 13 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index cc40373bc..cb0a5024b 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -102,7 +102,6 @@ def run(test_command): with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(error_log) else: - import pdb;pdb.set_trace() with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: f.write(fail.stdout) f.write(fail.stderr) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 9ec4c59e0..91af20a1a 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -134,7 +134,9 @@ # https://github.com/intel/torch-xpu-ops/issues/1504 "test_fsdp_optimizer_overlap", ), - "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": ( + "test_buffer_dtype_no_root_handle", + ), "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 "test_multi_forward_cpu", @@ -247,6 +249,10 @@ "test_gather_object_xpu", "test_gather_object_list_cpu", "test_gather_object_list_xpu", + # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_scatter_object_list_cpu", + "test_scatter_object_list_xpu", ), "../../../../test/distributed/test_compute_comm_reordering.py": None, "../../../../test/distributed/test_control_collectives.py": None, @@ -365,11 +371,34 @@ # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. 
# https://github.com/intel/torch-xpu-ops/issues/1556 "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_loss_parallel", + "test_mlp_training_is_seq_parallel_False_recompute_activation_False", + "test_mlp_training_is_seq_parallel_True_recompute_activation_False", + "test_transformer_req_grad_float64_thaw_all", + "test_transformer_training_is_seq_parallel_False_float64", + "test_transformer_training_is_seq_parallel_True_float64", + "test_sequence_parallel_style", ), "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, - "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_linear_col_wise_parallel", + "test_parallelize_mlp_with_module_api", + "test_parallelize_mlp_with_module_api_nested", + "test_parallelize_module_multi_wildcard", + "test_parallelize_module_src_data_rank", + "test_parallelize_module_with_digit", + "test_parallelize_module_with_question", + "test_parallelize_module_with_star", + "test_under_devicemesh_context", + ), "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, - "../../../../test/distributed/tensor/test_api.py": None, + "../../../../test/distributed/tensor/test_api.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_distribute_tensor_rank", + "test_distribute_tensor_uneven_sharding", + ), "../../../../test/distributed/tensor/test_attention.py": None, "../../../../test/distributed/tensor/test_common_rules.py": None, "../../../../test/distributed/tensor/test_dtensor.py": ( @@ -401,6 +430,15 @@ # https://github.com/intel/torch-xpu-ops/issues/1508 "test_mean", "test_nll_loss_and_cross_entropy", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_cumsum", + "test_layer_norm_bwd", + "test_layer_norm_bwd_req_grad", + "test_layer_norm_fwd", + "test_linear_op_reductions", + "test_shard0_svd", + "test_softmax_fwd", + "test_topk", ), "../../../../test/distributed/tensor/test_random_ops.py": ( # Need to update world size @@ -409,12 +447,39 @@ "../../../../test/distributed/tensor/test_redistribute.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 "test_redistribute_shard_dim_multi_dim_mesh", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_redistribute_shard_dim_change", + "test_redistribute_uneven_sharding", + "test_shard_to_replicate_forward_backward", + "test_shard_to_replicate_forward_backward_datatype_conversion", + "test_multi_dim_mesh", + ), + "../../../../test/distributed/tensor/test_tensor_ops.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_aten_contiguous", + "test_gather", + "test_index", + "test_op_out_variant", + "test_slice", + "test_stack", + "test_where_type_promotion", + ), + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_argmax", + "test_softmax_fwd", + ), + "../../../../test/distributed/_shard/test_sharder.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_custom_sharder", ), - "../../../../test/distributed/tensor/test_tensor_ops.py": None, - "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, - "../../../../test/distributed/_shard/test_sharder.py": None, # FSDP2 - "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": 
None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_nontensor_activations", + "test_unused_forward_module", + "test_unused_forward_output", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 "test_clip_grad_norm_2d", @@ -425,7 +490,10 @@ ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_train_mixed_requires_grad_per_group", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 "test_gradient_scaler", @@ -437,7 +505,12 @@ # https://github.com/intel/torch-xpu-ops/issues/1535 "test_fully_shard_training_memory", ), - "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_compute_dtype", + "test_grad_acc_with_reduce_dtype", + "test_reduce_dtype", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": ( # Performance test, should skip "test_fully_shard_training_overlap", @@ -453,6 +526,9 @@ # https://github.com/intel/torch-xpu-ops/issues/1504 "test_train_parity_multi_group_unshard_async_op", "test_train_parity_with_activation_checkpointing", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_1f1b_microbatching", + "test_gradient_accumulation", ), } diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 92a91355a..d58d3d9a5 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1153,6 +1153,7 @@ def copy_tests( def launch_test(test_case, skip_list=None, exe_list=None): os.environ["PYTORCH_ENABLE_XPU_FALLBACK"] = "1" os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" + rename = test_case.replace("../../../../", "").replace("/", "_") if skip_list is not None: skip_options = ' -k "not ' + skip_list[0] for skip_case in skip_list[1:]: @@ -1160,7 +1161,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - f"pytest -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) test_command += skip_options elif exe_list is not None: @@ -1170,11 +1171,11 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - f"pytest -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) test_command += exe_options else: test_command = ( - f"pytest -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) return os.system(test_command) From c555fbba106852870051801e7c187228b65c7791 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Fri, 18 Apr 2025 02:31:41 -0700 Subject: [PATCH 43/83] fixed lint error Signed-off-by: Cheng, Penghui --- 
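Note: this patch appears to be a formatting-only reflow of the string concatenation in launch_test() flagged by the linter; the generated pytest command is unchanged. For reference, a minimal sketch of what launch_test() produces for one skip_dict entry, using the test_fsdp_misc.py entry as it appears later in this series; the " and not " joining of additional skips is assumed from the pytest -k syntax used here.

test_case = "../../../../test/distributed/fsdp/test_fsdp_misc.py"
skip_list = ("test_fsdp_optimizer_overlap",)

# Strip the relative prefix and flatten the path, as the rename line added in the
# previous patch does, so each suite writes a unique op_ut_with_skip_*.xml report.
rename = test_case.replace("../../../../", "").replace("/", "_")

skip_options = ' -k "not ' + skip_list[0]
for skip_case in skip_list[1:]:
    skip_options += " and not " + skip_case  # assumed joiner for extra skipped cases
skip_options += '"'

test_command = (
    f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case + skip_options
)
print(test_command)
# -> pytest -v --junit-xml=./op_ut_with_skip_test_distributed_fsdp_test_fsdp_misc.py.xml ../../../../test/distributed/fsdp/test_fsdp_misc.py -k "not test_fsdp_optimizer_overlap"
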
test/xpu/xpu_test_utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 79a2b6eb6..8d451523a 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1172,13 +1172,11 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " - + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) test_command += exe_options else: test_command = ( - f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " - + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) return os.system(test_command) From 6d6a75e7262f176ee76670d09eeaad26c72d8669 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Fri, 18 Apr 2025 02:33:55 -0700 Subject: [PATCH 44/83] fixed lint error Signed-off-by: Cheng, Penghui --- test/xpu/xpu_test_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 8d451523a..d58d3d9a5 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1161,8 +1161,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " - + test_case + f"pytest -v --junit-xml=./op_ut_with_skip_{rename}.xml " + test_case ) test_command += skip_options elif exe_list is not None: From 1f451b2907297085322ea4133dbeb133434a3b60 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Wed, 23 Apr 2025 23:23:57 -0700 Subject: [PATCH 45/83] Add some UT cases Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 48 +++++++++++++++++--------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 4dd928ea9..49584c65a 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -282,16 +282,22 @@ "test_fsdp_setattr", "test_fsdp_unspecialized_forced_getattr_inline", "test_fsdp_unspecialized_forced_getattr_no_inline", - # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) - # https://github.com/intel/torch-xpu-ops/issues/1526 - "test_get_pg_attr", ), "../../../../test/distributed/test_fake_pg.py": None, - "../../../../test/distributed/test_functional_api.py": ( - # RuntimeError: UR backend failed. 
UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) - # https://github.com/intel/torch-xpu-ops/issues/1526 - "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", + "../../../../test/distributed/test_functional_api.py": None, + "../../../../test/distributed/test_inductor_collectives.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1581 + # Fatal Python error: Segmentation fault + "test_dynamo_rewrite_dist_all_gather", + "test_dynamo_rewrite_dist_all_gather_list", + "test_dynamo_rewrite_dist_all_gather_args_match", + "test_dynamo_rewrite_dist_reduce_scatter", + "test_dynamo_support_collective_op_with_async_op_False", + "test_dynamo_trace_reduce_scatter_tensor", + "test_dynamo_trace_all_gather_tensor", + "test_dynamo_trace_allgather_coalesced", + "test_inductor_reduce_scatter_coalesced", + "test_inductor_all_gather_coalesced", ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. @@ -401,21 +407,7 @@ ), "../../../../test/distributed/tensor/test_attention.py": None, "../../../../test/distributed/tensor/test_common_rules.py": None, - "../../../../test/distributed/tensor/test_dtensor.py": ( - # Passed with updated test code for world_size 8 - "test_auto_implicit_replication", - "test_default_value_sub_mesh", - "test_device_mesh_nd", - "test_dtensor_2d_mesh", - "test_dtensor_api_device_mesh_context_manager", - "test_dtensor_device_mesh_device_conversion", - "test_dtensor_spec_local_shard_offset", - "test_from_local_sub_mesh", - "test_implicit_replication", - "test_metadata_consistency_check", - "test_redistribute_sub_mesh", - "test_split_tensor_1D", - ), + "../../../../test/distributed/tensor/test_dtensor.py": None, "../../../../test/distributed/tensor/test_dtensor_compile.py": ( # https://jira.devtools.intel.com/browse/MLSL-3625 "test_2d_fsdp_tp_compile", @@ -530,6 +522,16 @@ "test_1f1b_microbatching", "test_gradient_accumulation", ), + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_tracker_multi_group_eager", + "test_tracker_non_root_forward_backward", + "test_tracker_with_activation_checkpointing", + ), + "../../../../test/distributed/_tools/test_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_memory_tracker.py": None, + "../../../../test/distributed/_tools/test_mod_tracker.py": None, } skip_dict_python = { From b2c5875eb61426dfb162d68af4d056d9f94ca07a Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Tue, 29 Apr 2025 03:10:18 -0700 Subject: [PATCH 46/83] Add UT cases for _shard and _tools folder Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 38 ++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 49584c65a..9626f6e35 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -461,10 +461,6 @@ "test_argmax", "test_softmax_fwd", ), - "../../../../test/distributed/_shard/test_sharder.py": ( - # https://jira.devtools.intel.com/browse/MLSL-3625 - "test_custom_sharder", - ), # FSDP2 "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": ( # 
https://jira.devtools.intel.com/browse/MLSL-3625 @@ -522,6 +518,40 @@ "test_1f1b_microbatching", "test_gradient_accumulation", ), + "../../../../test/distributed/_composable/test_replicate_with_compiler.py": ( + # AssertionError: Tensor-likes are not close! + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_compile_backward_only", + "test_compile_bf16", + "test_compile_fp16", + "test_compile_gpu", + "test_compile_gpu_ac", + ), + "../../../../test/distributed/_shard/test_sharder.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_custom_sharder", + ), + "../../../../test/distributed/_shard/sharded_tensor/test_logger.py": None, + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor.py": { + # RuntimeError: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) + "test_complete_world_size", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_grid_sharding", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_with_rpc_names", + "test_init_from_local_tensor", + # what(): Attempting to send a Tensor with unexpected device type xpu:3 + # https://github.com/intel/torch-xpu-ops/issues/1616 + "test_init_from_local_shards", + "test_init_from_local_shards_and_global_metadata", + }, + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py": None, + "../../../../test/distributed/_shard/sharding_plan/test_sharding_plan.py": None, + "../../../../test/distributed/_shard/sharding_spec/test_sharding_spec.py": None, "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path From 177d7c0cc0f3e9223695e3e5b6ee782a9564f3e8 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Mon, 5 May 2025 03:18:25 -0700 Subject: [PATCH 47/83] Clean skip list Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 141 ++++++++++++++++--------------- 1 file changed, 73 insertions(+), 68 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 9626f6e35..0254e69a3 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -9,7 +9,7 @@ "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", + # "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", @@ -37,7 +37,7 @@ ), "../../../../test/distributed/fsdp/test_fsdp_core.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_delayed_optim_step_offload_true_no_shard_xpu", + # "test_delayed_optim_step_offload_true_no_shard_xpu", "test_transformer_no_grad_mixed_precision_True_xpu", "test_delayed_optim_step_offload_false_no_shard_xpu", "test_delayed_optim_step_offload_false_none_xpu", @@ -74,10 +74,11 @@ 
"test_transformer_offload_true_none_xpu", "test_transformer_offload_true_shard_grad_op_xpu", ), - "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 - " test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + # ( + # # https://github.com/intel/torch-xpu-ops/issues/1504 + # " test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", + # ), "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 @@ -129,14 +130,15 @@ "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1535 - "test_fsdp_zero2_eval_with_prefetch", + # # https://github.com/intel/torch-xpu-ops/issues/1535 + # "test_fsdp_zero2_eval_with_prefetch", # https://github.com/intel/torch-xpu-ops/issues/1504 "test_fsdp_optimizer_overlap", ), - "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": ( - "test_buffer_dtype_no_root_handle", - ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + # ( + # "test_buffer_dtype_no_root_handle", + # ), "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 "test_multi_forward_cpu", @@ -225,30 +227,30 @@ # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path "test_reduce_scatter_tensor_coalesced", "test_reduce_scatter_tensor_single", - # https://github.com/intel/torch-xpu-ops/issues/1525 - # ValueError: trying to initialize the default process group twice! - "test_inductor_all_gather_into_tensor_coalesced", - "test_inductor_all_gather_into_tensor_single", - "test_inductor_all_reduce_coalesced", - "test_inductor_all_reduce_non_contig_input", - "test_inductor_all_reduce_single", - "test_inductor_all_to_all_single", - "test_inductor_broadcast", - "test_inductor_inplace_op_on_view", - "test_inductor_reduce_scatter_tensor_coalesced", - "test_inductor_reduce_scatter_tensor_single", - "test_inductor_reuse_buffer_after_inplace_collective", - "test_ranks_and_tag", - "test_wait_tensor", + # # https://github.com/intel/torch-xpu-ops/issues/1525 + # # ValueError: trying to initialize the default process group twice! 
+ # "test_inductor_all_gather_into_tensor_coalesced", + # "test_inductor_all_gather_into_tensor_single", + # "test_inductor_all_reduce_coalesced", + # "test_inductor_all_reduce_non_contig_input", + # "test_inductor_all_reduce_single", + # "test_inductor_all_to_all_single", + # "test_inductor_broadcast", + # "test_inductor_inplace_op_on_view", + # "test_inductor_reduce_scatter_tensor_coalesced", + # "test_inductor_reduce_scatter_tensor_single", + # "test_inductor_reuse_buffer_after_inplace_collective", + # "test_ranks_and_tag", + # "test_wait_tensor", ), "../../../../test/distributed/test_c10d_logger.py": None, "../../../../test/distributed/test_c10d_object_collectives.py": ( - # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds - # https://github.com/intel/torch-xpu-ops/issues/1535 - "test_gather_object_cpu", - "test_gather_object_xpu", - "test_gather_object_list_cpu", - "test_gather_object_list_xpu", + # # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # # https://github.com/intel/torch-xpu-ops/issues/1535 + # "test_gather_object_cpu", + # "test_gather_object_xpu", + # "test_gather_object_list_cpu", + # "test_gather_object_list_xpu", # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! # https://jira.devtools.intel.com/browse/MLSL-3625 "test_scatter_object_list_cpu", @@ -258,30 +260,30 @@ "../../../../test/distributed/test_control_collectives.py": None, "../../../../test/distributed/test_device_mesh.py": None, "../../../../test/distributed/test_dynamo_distributed.py": ( - # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' - "test_asymmetric_compilation", - "test_asymmetric_compilation_with_fx_cache", - # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. - "test_compiled_flex_attention_full_model_ddp", - "test_compiled_flex_attention_local_ddp", + # # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + # "test_asymmetric_compilation", + # "test_asymmetric_compilation_with_fx_cache", + # # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
+ # "test_compiled_flex_attention_full_model_ddp", + # "test_compiled_flex_attention_local_ddp", # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ # https://github.com/intel/torch-xpu-ops/issues/1527 - "test_compiler_collectives_automatic_dynamic_scalar", - "test_compiler_collectives_automatic_dynamic_speculation_divergence", - "test_compiler_collectives_automatic_dynamic_tensor", - "test_compiler_collectives_dim_mismatch", - "test_compiler_collectives_graph_break_empty_graph_still_collective", - "test_compiler_collectives_missing_source", - "test_compiler_collectives_scalar_missing_source", - "test_compiler_collectives_type_mismatch", - "test_ddp_activation_checkpointing", - "test_ddp_baseline_aot_eager_multiprocess", - "test_fsdp_activation_checkpointing", - "test_fsdp_aot_eager", - "test_fsdp_inductor", + # "test_compiler_collectives_automatic_dynamic_scalar", + # "test_compiler_collectives_automatic_dynamic_speculation_divergence", + # "test_compiler_collectives_automatic_dynamic_tensor", + # "test_compiler_collectives_dim_mismatch", + # "test_compiler_collectives_graph_break_empty_graph_still_collective", + # "test_compiler_collectives_missing_source", + # "test_compiler_collectives_scalar_missing_source", + # "test_compiler_collectives_type_mismatch", + # "test_ddp_activation_checkpointing", + # "test_ddp_baseline_aot_eager_multiprocess", + # "test_fsdp_activation_checkpointing", + # "test_fsdp_aot_eager", + # "test_fsdp_inductor", "test_fsdp_setattr", - "test_fsdp_unspecialized_forced_getattr_inline", - "test_fsdp_unspecialized_forced_getattr_no_inline", + # "test_fsdp_unspecialized_forced_getattr_inline", + # "test_fsdp_unspecialized_forced_getattr_no_inline", ), "../../../../test/distributed/test_fake_pg.py": None, "../../../../test/distributed/test_functional_api.py": None, @@ -299,11 +301,12 @@ "test_inductor_reduce_scatter_coalesced", "test_inductor_all_gather_coalesced", ), - "../../../../test/distributed/test_multi_threaded_pg.py": ( - # oneccl not support multi-threaded well, so skip it first. - # https://github.com/intel/torch-xpu-ops/issues/1509 - "test_bwd_sees_fwd_pg", - ), + "../../../../test/distributed/test_multi_threaded_pg.py": None, + # ( + # # oneccl not support multi-threaded well, so skip it first. 
+ # # https://github.com/intel/torch-xpu-ops/issues/1509 + # "test_bwd_sees_fwd_pg", + # ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 @@ -408,10 +411,11 @@ "../../../../test/distributed/tensor/test_attention.py": None, "../../../../test/distributed/tensor/test_common_rules.py": None, "../../../../test/distributed/tensor/test_dtensor.py": None, - "../../../../test/distributed/tensor/test_dtensor_compile.py": ( - # https://jira.devtools.intel.com/browse/MLSL-3625 - "test_2d_fsdp_tp_compile", - ), + "../../../../test/distributed/tensor/test_dtensor_compile.py": None, + # ( + # # https://jira.devtools.intel.com/browse/MLSL-3625 + # "test_2d_fsdp_tp_compile", + # ), "../../../../test/distributed/tensor/test_experimental_ops.py": ( # https://github.com/intel/torch-xpu-ops/issues/1535 "test_bernoulli", @@ -451,10 +455,10 @@ "test_aten_contiguous", "test_gather", "test_index", - "test_op_out_variant", + # "test_op_out_variant", "test_slice", "test_stack", - "test_where_type_promotion", + # "test_where_type_promotion", ), "../../../../test/distributed/tensor/experimental/test_register_sharding.py": ( # https://jira.devtools.intel.com/browse/MLSL-3625 @@ -482,10 +486,11 @@ # https://jira.devtools.intel.com/browse/MLSL-3625 "test_train_mixed_requires_grad_per_group", ), - "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1508 - "test_gradient_scaler", - ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + # ( + # # https://github.com/intel/torch-xpu-ops/issues/1508 + # "test_gradient_scaler", + # ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, From 939352d8723082e6ae8b0570653214164e69e92d Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Wed, 14 May 2025 20:54:51 -0700 Subject: [PATCH 48/83] clean skip list for distributed Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 152 +++++++++++-------------------- 1 file changed, 51 insertions(+), 101 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0254e69a3..5f70354f8 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -8,8 +8,8 @@ "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 - # "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", + # https://github.com/intel/torch-xpu-ops/issues/1666, 2.8 skipped + "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", @@ -37,7 +37,6 @@ ), "../../../../test/distributed/fsdp/test_fsdp_core.py": ( # 
https://github.com/intel/torch-xpu-ops/issues/1504 - # "test_delayed_optim_step_offload_true_no_shard_xpu", "test_transformer_no_grad_mixed_precision_True_xpu", "test_delayed_optim_step_offload_false_no_shard_xpu", "test_delayed_optim_step_offload_false_none_xpu", @@ -75,10 +74,6 @@ "test_transformer_offload_true_shard_grad_op_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, - # ( - # # https://github.com/intel/torch-xpu-ops/issues/1504 - # " test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # ), "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 @@ -130,21 +125,15 @@ "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( - # # https://github.com/intel/torch-xpu-ops/issues/1535 - # "test_fsdp_zero2_eval_with_prefetch", - # https://github.com/intel/torch-xpu-ops/issues/1504 + # https://github.com/intel/torch-xpu-ops/issues/1504, Performance test, should skip "test_fsdp_optimizer_overlap", ), "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, - # ( - # "test_buffer_dtype_no_root_handle", - # ), "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 "test_multi_forward_cpu", ), "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, - # https://github.com/intel/torch-xpu-ops/issues/1537 "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( # https://github.com/intel/torch-xpu-ops/issues/1504 "test_flatten_sharded_optim_state_dict_nested", @@ -245,12 +234,6 @@ ), "../../../../test/distributed/test_c10d_logger.py": None, "../../../../test/distributed/test_c10d_object_collectives.py": ( - # # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds - # # https://github.com/intel/torch-xpu-ops/issues/1535 - # "test_gather_object_cpu", - # "test_gather_object_xpu", - # "test_gather_object_list_cpu", - # "test_gather_object_list_xpu", # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! # https://jira.devtools.intel.com/browse/MLSL-3625 "test_scatter_object_list_cpu", @@ -260,35 +243,14 @@ "../../../../test/distributed/test_control_collectives.py": None, "../../../../test/distributed/test_device_mesh.py": None, "../../../../test/distributed/test_dynamo_distributed.py": ( - # # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' - # "test_asymmetric_compilation", - # "test_asymmetric_compilation_with_fx_cache", - # # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
- # "test_compiled_flex_attention_full_model_ddp", - # "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ - # https://github.com/intel/torch-xpu-ops/issues/1527 - # "test_compiler_collectives_automatic_dynamic_scalar", - # "test_compiler_collectives_automatic_dynamic_speculation_divergence", - # "test_compiler_collectives_automatic_dynamic_tensor", - # "test_compiler_collectives_dim_mismatch", - # "test_compiler_collectives_graph_break_empty_graph_still_collective", - # "test_compiler_collectives_missing_source", - # "test_compiler_collectives_scalar_missing_source", - # "test_compiler_collectives_type_mismatch", - # "test_ddp_activation_checkpointing", - # "test_ddp_baseline_aot_eager_multiprocess", - # "test_fsdp_activation_checkpointing", - # "test_fsdp_aot_eager", - # "test_fsdp_inductor", + # AssertionError: 'setattr() on Tensor.requires_grad' not found in 'Attempted to call function marked as skipped + # https://github.com/intel/torch-xpu-ops/issues/1667, 2.8 skipped "test_fsdp_setattr", - # "test_fsdp_unspecialized_forced_getattr_inline", - # "test_fsdp_unspecialized_forced_getattr_no_inline", ), "../../../../test/distributed/test_fake_pg.py": None, "../../../../test/distributed/test_functional_api.py": None, "../../../../test/distributed/test_inductor_collectives.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1581 + # https://github.com/intel/torch-xpu-ops/issues/1581, 2.8 skipped # Fatal Python error: Segmentation fault "test_dynamo_rewrite_dist_all_gather", "test_dynamo_rewrite_dist_all_gather_list", @@ -302,22 +264,19 @@ "test_inductor_all_gather_coalesced", ), "../../../../test/distributed/test_multi_threaded_pg.py": None, + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, # ( - # # oneccl not support multi-threaded well, so skip it first. 
- # # https://github.com/intel/torch-xpu-ops/issues/1509 - # "test_bwd_sees_fwd_pg", + # # https://github.com/intel/torch-xpu-ops/issues/1504 + # "test_stage_backward_weight_multiple_iters_xpu", + # "test_stage_backward_weight_xpu", + # "test_stage_backward_xpu", + # ), + "../../../../test/distributed/pipelining/test_microbatch.py": None, + # ( + # # https://github.com/intel/torch-xpu-ops/issues/1504, need retest with oneccl fix + # "test_chunk_spec_xpu", # ), - "../../../../test/distributed/test_store.py": None, - "../../../../test/distributed/pipelining/test_backward.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_stage_backward_weight_multiple_iters_xpu", - "test_stage_backward_weight_xpu", - "test_stage_backward_xpu", - ), - "../../../../test/distributed/pipelining/test_microbatch.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_chunk_spec_xpu", - ), "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, @@ -325,7 +284,7 @@ "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' # is not currently implemented for the XPU device - # https://github.com/intel/torch-xpu-ops/issues/1547 + # https://github.com/intel/torch-xpu-ops/issues/1547, 2.8 skipped "test_dtensor_seq_par_shard_dim_0", "test_dtensor_seq_par_shard_dim_1", "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", @@ -334,7 +293,7 @@ "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' - # https://github.com/intel/torch-xpu-ops/issues/1548 + # https://github.com/intel/torch-xpu-ops/issues/1548, 2.8 skipped "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", @@ -342,7 +301,7 @@ "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' - # https://github.com/intel/torch-xpu-ops/issues/1549 + # https://github.com/intel/torch-xpu-ops/issues/1549, 2.8 skipped "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", @@ -350,14 +309,14 @@ "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. 
- # https://github.com/intel/torch-xpu-ops/issues/1550 + # https://github.com/intel/torch-xpu-ops/issues/1550, 2.8 skipped "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' # is not currently implemented for the XPU device. - # https://github.com/intel/torch-xpu-ops/issues/1551 + # https://github.com/intel/torch-xpu-ops/issues/1551, 2.8 skipped "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", @@ -369,7 +328,7 @@ ), "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! - # https://github.com/intel/torch-xpu-ops/issues/1555 + # https://github.com/intel/torch-xpu-ops/issues/1555, 2.8 skipped "test_transformer_req_grad_seq_parallel_float32_thaw_all", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", @@ -378,7 +337,7 @@ "test_transformer_training_is_seq_parallel_False_float32", "test_transformer_training_is_seq_parallel_True_float32", # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. 
- # https://github.com/intel/torch-xpu-ops/issues/1556 + # https://github.com/intel/torch-xpu-ops/issues/1556, 2.8 skipped "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", # https://jira.devtools.intel.com/browse/MLSL-3625 "test_loss_parallel", @@ -412,18 +371,13 @@ "../../../../test/distributed/tensor/test_common_rules.py": None, "../../../../test/distributed/tensor/test_dtensor.py": None, "../../../../test/distributed/tensor/test_dtensor_compile.py": None, - # ( - # # https://jira.devtools.intel.com/browse/MLSL-3625 - # "test_2d_fsdp_tp_compile", - # ), "../../../../test/distributed/tensor/test_experimental_ops.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1535 + # https://github.com/intel/torch-xpu-ops/issues/1604 "test_bernoulli", ), "../../../../test/distributed/tensor/test_init.py": None, "../../../../test/distributed/tensor/test_math_ops.py": ( - # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path - # https://github.com/intel/torch-xpu-ops/issues/1508 + # https://github.com/intel/torch-xpu-ops/issues/1604, hang "test_mean", "test_nll_loss_and_cross_entropy", # https://jira.devtools.intel.com/browse/MLSL-3625 @@ -436,13 +390,8 @@ "test_softmax_fwd", "test_topk", ), - "../../../../test/distributed/tensor/test_random_ops.py": ( - # Need to update world size - "test_hsdp_tp_model_meta_init", - ), + "../../../../test/distributed/tensor/test_random_ops.py": None, "../../../../test/distributed/tensor/test_redistribute.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_redistribute_shard_dim_multi_dim_mesh", # https://jira.devtools.intel.com/browse/MLSL-3625 "test_redistribute_shard_dim_change", "test_redistribute_uneven_sharding", @@ -455,10 +404,9 @@ "test_aten_contiguous", "test_gather", "test_index", - # "test_op_out_variant", "test_slice", "test_stack", - # "test_where_type_promotion", + "test_where_type_promotion", ), "../../../../test/distributed/tensor/experimental/test_register_sharding.py": ( # https://jira.devtools.intel.com/browse/MLSL-3625 @@ -473,29 +421,31 @@ "test_unused_forward_output", ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1504 + # https://github.com/intel/torch-xpu-ops/issues/1661 "test_clip_grad_norm_2d", ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1571 + # ValueError: Cannot use ReduceOp.PREMUL_SUM with XCCL + # https://github.com/intel/torch-xpu-ops/issues/1571, 2.8 skipped "test_set_reduce_scatter_divide_factor", ), - "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": ( + # torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised + # https://github.com/intel/torch-xpu-ops/issues/1665, 2.8 skipped + "test_transformer_backend_inductor_fullgraph_True", + "test_nested_fully_shard_backend_inductor_fullgraph_True", + ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": ( # https://jira.devtools.intel.com/browse/MLSL-3625 "test_train_mixed_requires_grad_per_group", ), 
"../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, - # ( - # # https://github.com/intel/torch-xpu-ops/issues/1508 - # "test_gradient_scaler", - # ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1535 + # https://github.com/intel/torch-xpu-ops/issues/1605 "test_fully_shard_training_memory", ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": ( @@ -509,15 +459,15 @@ "test_fully_shard_training_overlap", ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1572 + # Expected zero exit code but got -9 + # https://github.com/intel/torch-xpu-ops/issues/1663 "test_dp_state_dict_cpu_offload", ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1508 - "test_post_optim_event", - # https://github.com/intel/torch-xpu-ops/issues/1504 + # https://github.com/intel/torch-xpu-ops/issues/1661 "test_train_parity_multi_group_unshard_async_op", + # checkpointing issue, 2.8 skipped "test_train_parity_with_activation_checkpointing", # https://jira.devtools.intel.com/browse/MLSL-3625 "test_1f1b_microbatching", @@ -525,7 +475,7 @@ ), "../../../../test/distributed/_composable/test_replicate_with_compiler.py": ( # AssertionError: Tensor-likes are not close! 
- # https://github.com/intel/torch-xpu-ops/issues/1504 + # https://github.com/intel/torch-xpu-ops/issues/1668, 2.8 skipped "test_compile_backward_only", "test_compile_bf16", "test_compile_fp16", @@ -539,6 +489,7 @@ "../../../../test/distributed/_shard/sharded_tensor/test_logger.py": None, "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor.py": { # RuntimeError: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) + # https://github.com/intel/torch-xpu-ops/issues/1617, 2.8 skipped "test_complete_world_size", "test_multiple_local_shards", "test_new_group", @@ -550,20 +501,19 @@ "test_with_rpc_names", "test_init_from_local_tensor", # what(): Attempting to send a Tensor with unexpected device type xpu:3 - # https://github.com/intel/torch-xpu-ops/issues/1616 + # https://github.com/intel/torch-xpu-ops/issues/1616, 2.8 skipped "test_init_from_local_shards", "test_init_from_local_shards_and_global_metadata", }, "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py": None, "../../../../test/distributed/_shard/sharding_plan/test_sharding_plan.py": None, "../../../../test/distributed/_shard/sharding_spec/test_sharding_spec.py": None, - "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1508 - # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path - "test_tracker_multi_group_eager", - "test_tracker_non_root_forward_backward", - "test_tracker_with_activation_checkpointing", - ), + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None, + # ( + # # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # # https://github.com/intel/torch-xpu-ops/issues/1508, 2.8 skipped + # "test_tracker_with_activation_checkpointing", + # ), "../../../../test/distributed/_tools/test_mem_tracker.py": None, "../../../../test/distributed/_tools/test_memory_tracker.py": None, "../../../../test/distributed/_tools/test_mod_tracker.py": None, From 1533b9b70f364f5b48693d22858143d87d37373b Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Wed, 14 May 2025 21:31:30 -0700 Subject: [PATCH 49/83] Add comments for skip list Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 5f70354f8..d99dd2bd0 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -8,6 +8,7 @@ "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( + # Accuracy gap in FSDP checkpoint related UT # https://github.com/intel/torch-xpu-ops/issues/1666, 2.8 skipped "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", @@ -17,11 +18,13 @@ "test_checkpoint_submodule_use_reentrant_False_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_ddp_parity_xpu", ), 
"../../../../test/distributed/fsdp/test_fsdp_comm.py": None, "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_bf16_hook_has_wrapping_False_sharding_strategy0", "test_bf16_hook_has_wrapping_False_sharding_strategy1", @@ -36,6 +39,7 @@ "test_fp16_hook_has_wrapping_True_sharding_strategy2", ), "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_transformer_no_grad_mixed_precision_True_xpu", "test_delayed_optim_step_offload_false_no_shard_xpu", @@ -76,6 +80,7 @@ "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_hooks_multi_traversal_xpu", "test_parity_with_ddp_xpu", @@ -83,6 +88,7 @@ ), "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", @@ -125,16 +131,19 @@ "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504, Performance test, should skip "test_fsdp_optimizer_overlap", ), "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_multi_forward_cpu", ), "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_flatten_sharded_optim_state_dict_nested", "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", @@ -176,12 +185,14 @@ ), # Performance check, skip # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( - # # https://github.com/intel/torch-xpu-ops/issues/1504 + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504 # "test_forward_overlap", # "test_forward_overlap_xpu", # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_fsdp_ddp_parity_with_grad_scaler_offload_false_none_none_none", "test_fsdp_ddp_parity_with_grad_scaler_offload_false_shard_grad_op_none_none", @@ -189,6 +200,7 @@ "test_fsdp_ddp_parity_with_grad_scaler_offload_true_shard_grad_op_none_none", ), "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": ( + # fsdp accuracy gaps # 
https://github.com/intel/torch-xpu-ops/issues/1504 "test_state_dict_save_load_flow_state_dict_type_local_state_dict", "test_state_dict_save_load_flow_state_dict_type_sharded_state_dict", @@ -199,6 +211,7 @@ "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": ( + # fsdp accuracy gaps # https://github.com/intel/torch-xpu-ops/issues/1504 "test_diff_hyperparams_sharding_strategy_str_full_shard", "test_diff_hyperparams_sharding_strategy_str_no_shard", @@ -267,14 +280,16 @@ "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, # ( - # # https://github.com/intel/torch-xpu-ops/issues/1504 + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504 # "test_stage_backward_weight_multiple_iters_xpu", # "test_stage_backward_weight_xpu", # "test_stage_backward_xpu", # ), "../../../../test/distributed/pipelining/test_microbatch.py": None, # ( - # # https://github.com/intel/torch-xpu-ops/issues/1504, need retest with oneccl fix + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504, need retest with oneccl fix # "test_chunk_spec_xpu", # ), "../../../../test/distributed/pipelining/test_pipe.py": None, From d74615fea55069839956d58c2b1a88d48cd65ec9 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 15 May 2025 23:52:31 -0700 Subject: [PATCH 50/83] move some issues from skip list to known issues report Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 230 +++---------------------------- 1 file changed, 16 insertions(+), 214 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index d99dd2bd0..05942858c 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -17,112 +17,15 @@ "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", "test_checkpoint_submodule_use_reentrant_False_xpu", ), - "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_ddp_parity_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, - "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_bf16_hook_has_wrapping_False_sharding_strategy0", - "test_bf16_hook_has_wrapping_False_sharding_strategy1", - "test_bf16_hook_has_wrapping_False_sharding_strategy2", - "test_bf16_hook_has_wrapping_True_sharding_strategy0", - "test_bf16_hook_has_wrapping_True_sharding_strategy1", - "test_bf16_hook_has_wrapping_True_sharding_strategy2", - "test_fp16_hook_has_wrapping_False_sharding_strategy1", - "test_fp16_hook_has_wrapping_False_sharding_strategy2", - "test_fp16_hook_has_wrapping_True_sharding_strategy0", - "test_fp16_hook_has_wrapping_True_sharding_strategy1", - "test_fp16_hook_has_wrapping_True_sharding_strategy2", - ), - "../../../../test/distributed/fsdp/test_fsdp_core.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_transformer_no_grad_mixed_precision_True_xpu", - "test_delayed_optim_step_offload_false_no_shard_xpu", - 
"test_delayed_optim_step_offload_false_none_xpu", - "test_delayed_optim_step_offload_false_shard_grad_op_xpu", - "test_delayed_optim_step_offload_true_none_xpu", - "test_delayed_optim_step_offload_true_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_false_no_shard_xpu", - "test_delayed_reduce_scatter_offload_false_none_xpu", - "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_true_none_xpu", - "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_offload_false_no_shard_xpu", - "test_mixture_of_experts_offload_false_none_xpu", - "test_mixture_of_experts_offload_false_shard_grad_op_xpu", - "test_mixture_of_experts_offload_true_none_xpu", - "test_mixture_of_experts_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_false_no_shard_xpu", - "test_nested_always_wrap_model_offload_false_none_xpu", - "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_true_none_xpu", - "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_false_no_shard_xpu", - "test_nested_wrapped_model_offload_false_none_xpu", - "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_true_none_xpu", - "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", - "test_transformer_offload_false_none_xpu", - "test_transformer_offload_false_shard_grad_op_xpu", - "test_transformer_offload_true_none_xpu", - "test_transformer_offload_true_shard_grad_op_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": None, "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, - "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_hooks_multi_traversal_xpu", - "test_parity_with_ddp_xpu", - "test_parity_with_non_frozen_fsdp_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, - "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", - 
"test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", - 
"test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True ", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", - "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", - ), + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, @@ -136,53 +39,9 @@ "test_fsdp_optimizer_overlap", ), "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, - "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_multi_forward_cpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, - "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_flatten_sharded_optim_state_dict_nested", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_False", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_True", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_False", - "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_True", - "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", - "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", - "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", - 
"test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", - "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_False", - "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_True", - "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_False", - "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_True", - "test_rekey_optim_state_dict_to_names", - "test_scatter_full_optim_state_dict_nested_halve_world_size", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", - "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", - "test_shard_full_optim_state_dict_nested_halve_world_size", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", - "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", - "test_use_orig_params", - ), + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": None, # Performance check, skip # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( # # fsdp accuracy gaps @@ -191,60 +50,20 @@ # "test_forward_overlap_xpu", # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, - "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_fsdp_ddp_parity_with_grad_scaler_offload_false_none_none_none", - "test_fsdp_ddp_parity_with_grad_scaler_offload_false_shard_grad_op_none_none", - "test_fsdp_ddp_parity_with_grad_scaler_offload_true_none_none_none", - "test_fsdp_ddp_parity_with_grad_scaler_offload_true_shard_grad_op_none_none", - ), - "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - 
"test_state_dict_save_load_flow_state_dict_type_local_state_dict", - "test_state_dict_save_load_flow_state_dict_type_sharded_state_dict", - "test_state_dict_save_load_flow_state_dict_type_state_dict", - ), + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, - "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": ( - # fsdp accuracy gaps - # https://github.com/intel/torch-xpu-ops/issues/1504 - "test_diff_hyperparams_sharding_strategy_str_full_shard", - "test_diff_hyperparams_sharding_strategy_str_no_shard", - "test_diff_hyperparams_sharding_strategy_str_shard_grad_op", - "test_no_sync_correctness", - ), + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, "../../../../test/distributed/fsdp/test_wrap.py": None, "../../../../test/distributed/test_backends.py": None, "../../../../test/distributed/test_c10d_common.py": None, - "../../../../test/distributed/test_c10d_functional_native.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1508 - # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path - "test_reduce_scatter_tensor_coalesced", - "test_reduce_scatter_tensor_single", - # # https://github.com/intel/torch-xpu-ops/issues/1525 - # # ValueError: trying to initialize the default process group twice! - # "test_inductor_all_gather_into_tensor_coalesced", - # "test_inductor_all_gather_into_tensor_single", - # "test_inductor_all_reduce_coalesced", - # "test_inductor_all_reduce_non_contig_input", - # "test_inductor_all_reduce_single", - # "test_inductor_all_to_all_single", - # "test_inductor_broadcast", - # "test_inductor_inplace_op_on_view", - # "test_inductor_reduce_scatter_tensor_coalesced", - # "test_inductor_reduce_scatter_tensor_single", - # "test_inductor_reuse_buffer_after_inplace_collective", - # "test_ranks_and_tag", - # "test_wait_tensor", - ), + "../../../../test/distributed/test_c10d_functional_native.py": None, "../../../../test/distributed/test_c10d_logger.py": None, "../../../../test/distributed/test_c10d_object_collectives.py": ( # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! 
@@ -275,6 +94,7 @@ "test_dynamo_trace_allgather_coalesced", "test_inductor_reduce_scatter_coalesced", "test_inductor_all_gather_coalesced", + "test_reorder_peak_memory", ), "../../../../test/distributed/test_multi_threaded_pg.py": None, "../../../../test/distributed/test_store.py": None, @@ -386,15 +206,9 @@ "../../../../test/distributed/tensor/test_common_rules.py": None, "../../../../test/distributed/tensor/test_dtensor.py": None, "../../../../test/distributed/tensor/test_dtensor_compile.py": None, - "../../../../test/distributed/tensor/test_experimental_ops.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1604 - "test_bernoulli", - ), + "../../../../test/distributed/tensor/test_experimental_ops.py": None, "../../../../test/distributed/tensor/test_init.py": None, "../../../../test/distributed/tensor/test_math_ops.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1604, hang - "test_mean", - "test_nll_loss_and_cross_entropy", # https://jira.devtools.intel.com/browse/MLSL-3625 "test_cumsum", "test_layer_norm_bwd", @@ -435,10 +249,7 @@ "test_unused_forward_module", "test_unused_forward_output", ), - "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1661 - "test_clip_grad_norm_2d", - ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( # ValueError: Cannot use ReduceOp.PREMUL_SUM with XCCL # https://github.com/intel/torch-xpu-ops/issues/1571, 2.8 skipped @@ -459,10 +270,7 @@ "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, - "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1605 - "test_fully_shard_training_memory", - ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": ( # https://jira.devtools.intel.com/browse/MLSL-3625 "test_compute_dtype", @@ -473,15 +281,9 @@ # Performance test, should skip "test_fully_shard_training_overlap", ), - "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": ( - # Expected zero exit code but got -9 - # https://github.com/intel/torch-xpu-ops/issues/1663 - "test_dp_state_dict_cpu_offload", - ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1661 - "test_train_parity_multi_group_unshard_async_op", # checkpointing issue, 2.8 skipped "test_train_parity_with_activation_checkpointing", # https://jira.devtools.intel.com/browse/MLSL-3625 From 7493676b7da640fa37e19b125fe4504bf85e87a4 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 06:01:16 -0700 Subject: [PATCH 51/83] enable fsdp cases based on local branch --- test/xpu/run_distributed_local.py | 63 +++++++++++++++++++++++++++++++ test/xpu/skip_list_dist_local.py | 57 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 
100644 test/xpu/run_distributed_local.py create mode 100644 test/xpu/skip_list_dist_local.py diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..8074b3292 --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,63 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict +from xpu_test_utils import launch_test + +res = 0 +fail_test = [] + +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log", "r") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(' ') + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split('/')[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ( "XL" in item or "S" in item ): + if len(affinity) == 0: + affinity = str(j-2) + else: + affinity = affinity + ',' + str(j-2) + gpu_dict[i] = affinity + + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..08f90c6b5 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,57 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + "test_delayed_optim_step_offload_true_no_shard_xpu", + "test_transformer_no_grad_mixed_precision_True_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + 
"../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + "test_fsdp_zero2_eval_with_prefetch", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1537 + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + "test_use_orig_params", + ), + # Performance check, skip + #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # "test_forward_overlap", + # "test_forward_overlap_xpu", + #), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": None, + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, +} From 7d5e6a90c6c61027267ec4ede3c79da753a66420 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 19:46:24 -0700 Subject: [PATCH 52/83] add 2025.0 WA --- test/xpu/run_distributed_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 8074b3292..b6a9ef60c 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -8,6 +8,9 @@ res = 0 fail_test = [] +os.environ["CCL_ATL_TRANSPORT"] = "ofi" +os.environ["CCL_SEND"] = "direct" +os.environ["CCL_RECV"] = "direct" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: From 565d86adbad02c52c6168a2f6aa47d0af19fc87a Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:27:44 -0700 Subject: [PATCH 53/83] Update distributed UT cases in DDP and PP Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 29 +++++++++- test/xpu/skip_list_dist_local.py | 91 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index b6a9ef60c..982f05409 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -2,15 +2,17 @@ import subprocess import sys -from skip_list_dist_local import skip_dict +from skip_list_dist_local import skip_dict, skip_dict_python from xpu_test_utils import launch_test res = 0 +res2 = 0 fail_test = [] os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" os.environ["CCL_RECV"] = "direct" +os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: @@ -48,6 +50,29 @@ print("xpu-smi topology failed") 
sys.exit(255) +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + +for key in skip_dict_python: + skip_list = skip_dict_python[key] + test_command = ["python", key] + fail = run(test_command) + if fail.returncode: + for line in fail.stderr.split("\n"): + if "FAIL: " in line: + is_error = True + for skip_case in skip_list: + if skip_case in line: + print("Skiped error: ", key + " " + skip_case) + is_error = False + if is_error: + res2 += fail.returncode + fail_test.append("".join(key + " " + line)) + # run pytest with skiplist for key in skip_dict: skip_list = skip_dict[key] @@ -61,6 +86,6 @@ exit_code = os.WEXITSTATUS(res) if exit_code == 0: - sys.exit(res) + sys.exit(res2) else: sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 08f90c6b5..d65b7aee6 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -54,4 +54,95 @@ "../../../../test/distributed/fsdp/test_shard_utils.py": None, "../../../../test/distributed/fsdp/test_utils.py": None, "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_reduce_scatter_tensor_coalesced", + "test_reduce_scatter_tensor_single", + # https://github.com/intel/torch-xpu-ops/issues/1525 + # ValueError: trying to initialize the default process group twice! + "test_inductor_all_gather_into_tensor_coalesced", + "test_inductor_all_gather_into_tensor_single", + "test_inductor_all_reduce_coalesced", + "test_inductor_all_reduce_non_contig_input", + "test_inductor_all_reduce_single", + "test_inductor_all_to_all_single", + "test_inductor_broadcast", + "test_inductor_inplace_op_on_view", + "test_inductor_reduce_scatter_tensor_coalesced", + "test_inductor_reduce_scatter_tensor_single", + "test_inductor_reuse_buffer_after_inplace_collective", + "test_ranks_and_tag", + "test_wait_tensor", + ), + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_gather_object_cpu", + "test_gather_object_xpu", + "test_gather_object_list_cpu", + "test_gather_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + "test_asymmetric_compilation", + "test_asymmetric_compilation_with_fx_cache", + # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. 
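For context, each skip_dict entry above maps a test file path (relative to the build tree) to either None, meaning run every case, or a tuple of case names to deselect. launch_test() lives in xpu_test_utils and is not shown in this patch; one plausible way such an entry can be turned into a pytest invocation is sketched below, purely as an illustration and not necessarily what launch_test actually does.

# Illustration only: translate a skip_dict entry into a pytest -k expression.
import shlex

def build_command(test_file, skip_cases):
    cmd = ["pytest", "-v", test_file]
    if skip_cases:
        # "not test_a and not test_b" deselects the known failures by name
        cmd += ["-k", " and ".join(f"not {case}" for case in skip_cases)]
    return cmd

entry = "../../../../test/distributed/fsdp/test_fsdp_misc.py"
skips = ("test_fsdp_zero2_eval_with_prefetch",)
print(shlex.join(build_command(entry, skips)))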
+ "test_compiled_flex_attention_full_model_ddp", + "test_compiled_flex_attention_local_ddp", + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # https://github.com/intel/torch-xpu-ops/issues/1527 + "test_compiler_collectives_automatic_dynamic_scalar", + "test_compiler_collectives_automatic_dynamic_speculation_divergence", + "test_compiler_collectives_automatic_dynamic_tensor", + "test_compiler_collectives_dim_mismatch", + "test_compiler_collectives_graph_break_empty_graph_still_collective", + "test_compiler_collectives_missing_source", + "test_compiler_collectives_scalar_missing_source", + "test_compiler_collectives_type_mismatch", + "test_ddp_activation_checkpointing", + "test_ddp_baseline_aot_eager_multiprocess", + "test_fsdp_activation_checkpointing", + "test_fsdp_aot_eager", + "test_fsdp_inductor", + "test_fsdp_setattr", + "test_fsdp_unspecialized_forced_getattr_inline", + "test_fsdp_unspecialized_forced_getattr_no_inline", + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_get_pg_attr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_tracing_xpu", + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + ), + "../../../../test/distributed/test_multi_threaded_pg.py": ( + # oneccl not support multi-threaded well, so skip it first. + "test_bwd_sees_fwd_pg", + ), + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/pipelining/test_backward.py": None, + "../../../../test/distributed/pipelining/test_microbatch.py": None, + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, +} + +skip_dict_python = { + "distributed/test_c10d_ops_xccl.py": None, + "distributed/test_c10d_xccl.py": None, + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
+ "../../../../test/distributed/pipelining/test_stage.py": None, } From 9d0ddfe82f912947d0cc0e95a0f1d640c04f80c0 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 01:36:16 -0700 Subject: [PATCH 54/83] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 7 +++---- test/xpu/skip_list_dist_local.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 982f05409..a5f0c8098 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -11,7 +11,7 @@ os.environ["CCL_ATL_TRANSPORT"] = "ofi" os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" +os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") @@ -36,13 +36,12 @@ else: affinity = affinity + ',' + str(j-2) gpu_dict[i] = affinity - - + max_affinity = "" for key, value in gpu_dict.items(): if len(value) > len(max_affinity): max_affinity = value - + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index d65b7aee6..6ce62b8ca 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -96,7 +96,7 @@ # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. "test_compiled_flex_attention_full_model_ddp", "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ # https://github.com/intel/torch-xpu-ops/issues/1527 "test_compiler_collectives_automatic_dynamic_scalar", "test_compiler_collectives_automatic_dynamic_speculation_divergence", @@ -131,13 +131,13 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, } skip_dict_python = { From 9cf12d50b27aecd1275717613c9a25d33d105029 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 3 Apr 2025 02:01:55 -0700 Subject: [PATCH 55/83] Fixed pylint error Signed-off-by: Cheng, Penghui --- test/xpu/run_distributed_local.py | 38 ++++++++++++++++--------------- test/xpu/skip_list_dist_local.py | 17 +++++++------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index a5f0c8098..d4db4785a 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -17,29 +17,29 @@ ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") if ret == 0: gpu_dict = {} - with open("topology.log", "r") as file: + with open("topology.log") as file: lines = file.readlines() for line in lines: - if 
"CPU Affinity" in line: - continue - line = line.strip() - if line.startswith("GPU "): - items = line.split(' ') - items = [x for x in items if x] - gpu_id = items[1] - i = gpu_id.split('/')[0] - affinity = "" - for j, item in enumerate(items): - if "SYS" not in item and ( "XL" in item or "S" in item ): - if len(affinity) == 0: - affinity = str(j-2) - else: - affinity = affinity + ',' + str(j-2) - gpu_dict[i] = affinity + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity max_affinity = "" for key, value in gpu_dict.items(): - if len(value) > len(max_affinity): + if len(value) > len(max_affinity): max_affinity = value os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) @@ -49,6 +49,7 @@ print("xpu-smi topology failed") sys.exit(255) + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -56,6 +57,7 @@ def run(test_command): print(result.stderr) return result + for key in skip_dict_python: skip_list = skip_dict_python[key] test_command = ["python", key] diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 6ce62b8ca..0ac46961e 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -1,10 +1,10 @@ skip_dict = { "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, # https://github.com/intel/torch-xpu-ops/issues/1536 - #"../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( # "test_distributed_checkpoint_state_dict_type0_xpu", # "test_distributed_checkpoint_state_dict_type1_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, @@ -37,11 +37,11 @@ "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( "test_use_orig_params", ), - # Performance check, skip - #"../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( # "test_forward_overlap", # "test_forward_overlap_xpu", - #), + # ), "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None, "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None, @@ -58,7 +58,7 @@ "../../../../test/distributed/test_c10d_common.py": None, "../../../../test/distributed/test_c10d_functional_native.py": ( # https://github.com/intel/torch-xpu-ops/issues/1508 - #RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path "test_reduce_scatter_tensor_coalesced", "test_reduce_scatter_tensor_single", # https://github.com/intel/torch-xpu-ops/issues/1525 @@ -123,7 +123,7 @@ # RuntimeError: UR backend failed. 
UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) # https://github.com/intel/torch-xpu-ops/issues/1526 "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu" + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", ), "../../../../test/distributed/test_multi_threaded_pg.py": ( # oneccl not support multi-threaded well, so skip it first. @@ -131,7 +131,6 @@ ), "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, - "../../../../test/distributed/pipelining/test_backward.py": None, "../../../../test/distributed/pipelining/test_microbatch.py": None, "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, @@ -143,6 +142,6 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. "../../../../test/distributed/pipelining/test_stage.py": None, } From 8705ec54115e50d79d1abf719ba17a54290004ba Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:25:50 +0000 Subject: [PATCH 56/83] add distributed ut in CI --- .github/scripts/ut_result_check.sh | 17 ++-- .github/workflows/_linux_build.yml | 30 ++++++- .github/workflows/_linux_ut.yml | 140 +++++++++++++++++++++++++++++ .github/workflows/pull.yml | 25 ++++++ 4 files changed, 202 insertions(+), 10 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index a666bfa9a..260672ce0 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -134,19 +134,20 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - grep -E "have failures" xpu_distributed_test.log | awk '{print $1}' >> ./"${ut_suite}"_xpu_distributed_test_failed.log - compare_and_filter_logs "${ut_suite}"_xpu_distributed_test_failed.log Known_issue.log - if [[ -f "${ut_suite}_xpu_distributed_test_failed_filtered.log" ]]; then - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed_filtered.log") + +if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + grep -E "have failures" xpu_distributed_test.log | awk '{print $1}' >> ./"${ut_suite}"_test_failed.log + compare_and_filter_logs "${ut_suite}"_test_failed.log Known_issue.log + if [[ -f "${ut_suite}_test_failed_filtered.log" ]]; then + num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_test_failed_filtered.log") else - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") + num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_test_failed.log") fi echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" echo -e "=========================================================================" - cat 
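compare_and_filter_logs is defined earlier in ut_result_check.sh and is not part of this diff; as used here, its effect is to drop failures already recorded in Known_issue.log before the remaining lines are counted. Roughly, and only as a sketch of that behavior with illustrative names:

# Rough Python equivalent of the known-issue filtering step in ut_result_check.sh.
def filter_known_issues(failed_lines, known_issue_lines):
    known = {line.strip() for line in known_issue_lines if line.strip()}
    return [line for line in failed_lines if line.strip() not in known]

failed = ["test_gather_object_cpu", "test_new_feature_regression"]
known = ["test_gather_object_cpu"]           # hypothetical known issue
remaining = filter_known_issues(failed, known)
print(len(remaining), remaining)             # 1 real failure left to report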
"./${ut_suite}_xpu_distributed_test_failed.log" + cat "./${ut_suite}_test_failed.log" ((num_failed=num_failed_xpu_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index f7381a502..69a843e35 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -74,8 +74,34 @@ jobs: pip install -U pip wheel setuptools - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - with: - path: torch-xpu-ops + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_build || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_build + conda create -n xpu_build python=${{ inputs.python }} cmake=3.28 ninja -y + source activate xpu_build + cd ../ && rm -rf pytorch + pip install requests + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi +>>>>>>> 62e9ff75 (add distributed ut in CI) - name: Build Pytorch XPU run: | set -xe diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 7f29d89d3..2fab9fdb1 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -461,3 +461,143 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 02cd96b5b..24f9e2b1e 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -66,6 +66,31 @@ jobs: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e Inductor-XPU-E2E-CI-Tests: name: preci-linux / e2e_test From 4b94ee22404a692a3bd8016d7ede32dbf186cbf8 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:52:17 +0000 Subject: [PATCH 57/83] update if condition --- .github/workflows/_linux_build.yml | 26 ++++++++++++++------------ .github/workflows/_linux_ut.yml | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 69a843e35..d8c7eb2c5 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -86,20 +86,22 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - fi - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi >>>>>>> 62e9ff75 (add distributed ut in CI) - name: Build Pytorch XPU diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 2fab9fdb1..12cf823f5 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: GH_TOKEN: ${{ github.token }} From e52ae48b505a553c7f44fa8814a7e7fe31a4060d Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 23:02:20 +0000 Subject: [PATCH 58/83] keep_torch_xpu_ops --- .github/workflows/pull.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 24f9e2b1e..ee0caaac8 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,6 +78,7 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 + keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From 0fc4430b7b512309b7c08109476789cba906a679 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 13:37:10 +0000 Subject: [PATCH 59/83] update pytorch build --- .github/workflows/_linux_build.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index d8c7eb2c5..69a843e35 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -86,22 +86,20 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi >>>>>>> 62e9ff75 (add distributed ut in CI) - name: Build Pytorch XPU From 45dfc65ece6d109e6b6ac0299975d0931c1afc4b Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Wed, 9 Apr 2025 00:18:27 -0700 Subject: [PATCH 60/83] Enabled UT for test/distributed/tensor Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0ac46961e..42cdebf19 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -136,7 +136,85 @@ "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( + # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' + # is not currently implemented for the XPU device + # https://github.com/intel/torch-xpu-ops/issues/1547 + "test_dtensor_seq_par_shard_dim_0", + "test_dtensor_seq_par_shard_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' + # https://github.com/intel/torch-xpu-ops/issues/1548 + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", + # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' + # https://github.com/intel/torch-xpu-ops/issues/1549 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", + # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1550 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", + # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' + # is not currently implemented for the XPU device. 
+ # https://github.com/intel/torch-xpu-ops/issues/1551 + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_2", + ), + "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( + # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! + # https://github.com/intel/torch-xpu-ops/issues/1555 + "test/distributed/tensor/parallel/test_tp_examples.py::DistTensorParallelExampleTest::test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_output__tok_embeddings", + "test_transformer_training_is_seq_parallel_False_float32", + "test_transformer_training_is_seq_parallel_True_float32", + # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. + # https://github.com/intel/torch-xpu-ops/issues/1556 + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + ), "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, + "../../../../test/distributed/tensor/test_api.py": None, + "../../../../test/distributed/tensor/test_attention.py": None, + "../../../../test/distributed/tensor/test_common_rules.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": None, + "../../../../test/distributed/tensor/test_dtensor_compile.py": None, + "../../../../test/distributed/tensor/test_experimental_ops.py": None, + "../../../../test/distributed/tensor/test_init.py": None, + "../../../../test/distributed/tensor/test_math_ops.py": ( + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_mean", + "test_nll_loss_and_cross_entropy", + ), + "../../../../test/distributed/tensor/test_random_ops.py": None, + "../../../../test/distributed/tensor/test_redistribute.py": None, + "../../../../test/distributed/tensor/test_tensor_ops.py": None, + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, } skip_dict_python = { @@ -144,4 +222,5 @@ "distributed/test_c10d_xccl.py": None, "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, } From ebbce64a2d99ed92e2142cbe18c6a8643a0e56a2 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 2 Apr 2025 06:01:16 -0700 Subject: [PATCH 61/83] enable fsdp cases based on local branch --- test/xpu/run_distributed_local.py | 75 ++++++-- test/xpu/skip_list_dist_local.py | 301 +++++++++++++++++++++--------- 2 files changed, 269 insertions(+), 107 deletions(-) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index d4db4785a..dcf079992 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -3,15 +3,12 @@ import sys from skip_list_dist_local import skip_dict, skip_dict_python -from xpu_test_utils import launch_test res = 0 res2 = 0 fail_test = [] +error_log = "" -os.environ["CCL_ATL_TRANSPORT"] = "ofi" -os.environ["CCL_SEND"] = "direct" -os.environ["CCL_RECV"] = "direct" os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") @@ -50,6 +47,9 @@ sys.exit(255) +from xpu_test_utils import launch_test + + # run python test def run(test_command): result = subprocess.run(test_command, capture_output=True, text=True) @@ -59,20 +59,63 @@ def run(test_command): for key in skip_dict_python: - skip_list = skip_dict_python[key] + skip_list = skip_dict_python[key] if skip_dict_python[key] else [] test_command = ["python", key] fail = run(test_command) + num_skipped = 0 + num_err = 0 if fail.returncode: - for line in fail.stderr.split("\n"): - if "FAIL: " in line: - is_error = True - for skip_case in skip_list: - if skip_case in line: - print("Skiped error: ", key + " " + skip_case) - is_error = False - if is_error: - res2 += fail.returncode - fail_test.append("".join(key + " " + line)) + for i, err in enumerate(fail.stderr.split("FAIL: ")): + if i == 0 and len(err) > 0: + error_log += err + continue + is_skipped = False + for skip_case in skip_list: + if skip_case in err: + print("Skipped error: ", key + " " + skip_case) + num_skipped += 1 + is_skipped = True + break + if not is_skipped: + num_err += 1 + res2 += fail.returncode + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ( + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n" + ) + else: + error_log += line + "\n" + else: + error_log += "FAIL: " + err + else: + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ( + "FAILED (failures=" + + str(int(num_errs) - num_skipped) + + f" skipped {num_skipped} cases" + + ")\n" + ) + + renamed_key = key.replace("../../../../", "").replace("/", "_") + if num_err > 0: + fail_test.append(key) + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(error_log) + else: + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(fail.stdout) + f.write(fail.stderr) # run pytest with skiplist for key in skip_dict: @@ -89,4 +132,4 @@ def run(test_command): if exit_code == 0: sys.exit(res2) else: - sys.exit(exit_code) + sys.exit(exit_code) \ No newline at end of file diff --git 
a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 42cdebf19..58c73eed2 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -7,14 +7,20 @@ # ), "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, - "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( + # Accuracy gap in FSDP checkpoint related UT + # https://github.com/intel/torch-xpu-ops/issues/1666, 2.8 skipped + "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", + "test_checkpoint_submodule_use_reentrant_False_xpu", + ), "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None, "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, - "../../../../test/distributed/fsdp/test_fsdp_core.py": ( - "test_delayed_optim_step_offload_true_no_shard_xpu", - "test_transformer_no_grad_mixed_precision_True_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_core.py": None, "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None, "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None, @@ -28,17 +34,18 @@ "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( - "test_fsdp_zero2_eval_with_prefetch", + # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504, Performance test, should skip + "test_fsdp_optimizer_overlap", ), "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, - # https://github.com/intel/torch-xpu-ops/issues/1537 - "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( - "test_use_orig_params", - ), + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": None, # Performance check, skip # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504 # "test_forward_overlap", # "test_forward_overlap_xpu", # ), @@ -56,82 +63,55 @@ "../../../../test/distributed/fsdp/test_wrap.py": None, "../../../../test/distributed/test_backends.py": None, "../../../../test/distributed/test_c10d_common.py": None, - "../../../../test/distributed/test_c10d_functional_native.py": ( - # https://github.com/intel/torch-xpu-ops/issues/1508 - # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path - "test_reduce_scatter_tensor_coalesced", - "test_reduce_scatter_tensor_single", - # https://github.com/intel/torch-xpu-ops/issues/1525 - # ValueError: trying to initialize the default process group twice! 
- "test_inductor_all_gather_into_tensor_coalesced", - "test_inductor_all_gather_into_tensor_single", - "test_inductor_all_reduce_coalesced", - "test_inductor_all_reduce_non_contig_input", - "test_inductor_all_reduce_single", - "test_inductor_all_to_all_single", - "test_inductor_broadcast", - "test_inductor_inplace_op_on_view", - "test_inductor_reduce_scatter_tensor_coalesced", - "test_inductor_reduce_scatter_tensor_single", - "test_inductor_reuse_buffer_after_inplace_collective", - "test_ranks_and_tag", - "test_wait_tensor", - ), + "../../../../test/distributed/test_c10d_functional_native.py": None, "../../../../test/distributed/test_c10d_logger.py": None, "../../../../test/distributed/test_c10d_object_collectives.py": ( - # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds - # https://github.com/intel/torch-xpu-ops/issues/1535 - "test_gather_object_cpu", - "test_gather_object_xpu", - "test_gather_object_list_cpu", - "test_gather_object_list_xpu", + # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_scatter_object_list_cpu", + "test_scatter_object_list_xpu", ), "../../../../test/distributed/test_compute_comm_reordering.py": None, "../../../../test/distributed/test_control_collectives.py": None, "../../../../test/distributed/test_device_mesh.py": None, "../../../../test/distributed/test_dynamo_distributed.py": ( - # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' - "test_asymmetric_compilation", - "test_asymmetric_compilation_with_fx_cache", - # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. - "test_compiled_flex_attention_full_model_ddp", - "test_compiled_flex_attention_local_ddp", - # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ - # https://github.com/intel/torch-xpu-ops/issues/1527 - "test_compiler_collectives_automatic_dynamic_scalar", - "test_compiler_collectives_automatic_dynamic_speculation_divergence", - "test_compiler_collectives_automatic_dynamic_tensor", - "test_compiler_collectives_dim_mismatch", - "test_compiler_collectives_graph_break_empty_graph_still_collective", - "test_compiler_collectives_missing_source", - "test_compiler_collectives_scalar_missing_source", - "test_compiler_collectives_type_mismatch", - "test_ddp_activation_checkpointing", - "test_ddp_baseline_aot_eager_multiprocess", - "test_fsdp_activation_checkpointing", - "test_fsdp_aot_eager", - "test_fsdp_inductor", + # AssertionError: 'setattr() on Tensor.requires_grad' not found in 'Attempted to call function marked as skipped + # https://github.com/intel/torch-xpu-ops/issues/1667, 2.8 skipped "test_fsdp_setattr", - "test_fsdp_unspecialized_forced_getattr_inline", - "test_fsdp_unspecialized_forced_getattr_no_inline", - # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) - # https://github.com/intel/torch-xpu-ops/issues/1526 - "test_get_pg_attr", ), "../../../../test/distributed/test_fake_pg.py": None, - "../../../../test/distributed/test_functional_api.py": ( - # RuntimeError: UR backend failed. 
UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) - # https://github.com/intel/torch-xpu-ops/issues/1526 - "test_tracing_xpu", - "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", - ), - "../../../../test/distributed/test_multi_threaded_pg.py": ( - # oneccl not support multi-threaded well, so skip it first. - "test_bwd_sees_fwd_pg", + "../../../../test/distributed/test_functional_api.py": None, + "../../../../test/distributed/test_inductor_collectives.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1581, 2.8 skipped + # Fatal Python error: Segmentation fault + "test_dynamo_rewrite_dist_all_gather", + "test_dynamo_rewrite_dist_all_gather_list", + "test_dynamo_rewrite_dist_all_gather_args_match", + "test_dynamo_rewrite_dist_reduce_scatter", + "test_dynamo_support_collective_op_with_async_op_False", + "test_dynamo_trace_reduce_scatter_tensor", + "test_dynamo_trace_all_gather_tensor", + "test_dynamo_trace_allgather_coalesced", + "test_inductor_reduce_scatter_coalesced", + "test_inductor_all_gather_coalesced", + "test_reorder_peak_memory", ), + "../../../../test/distributed/test_multi_threaded_pg.py": None, "../../../../test/distributed/test_store.py": None, "../../../../test/distributed/pipelining/test_backward.py": None, + # ( + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504 + # "test_stage_backward_weight_multiple_iters_xpu", + # "test_stage_backward_weight_xpu", + # "test_stage_backward_xpu", + # ), "../../../../test/distributed/pipelining/test_microbatch.py": None, + # ( + # # fsdp accuracy gaps + # https://github.com/intel/torch-xpu-ops/issues/1504, need retest with oneccl fix + # "test_chunk_spec_xpu", + # ), "../../../../test/distributed/pipelining/test_pipe.py": None, "../../../../test/distributed/pipelining/test_schedule.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, @@ -139,7 +119,7 @@ "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' # is not currently implemented for the XPU device - # https://github.com/intel/torch-xpu-ops/issues/1547 + # https://github.com/intel/torch-xpu-ops/issues/1547, 2.8 skipped "test_dtensor_seq_par_shard_dim_0", "test_dtensor_seq_par_shard_dim_1", "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", @@ -148,7 +128,7 @@ "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' - # https://github.com/intel/torch-xpu-ops/issues/1548 + # https://github.com/intel/torch-xpu-ops/issues/1548, 2.8 skipped "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", @@ -156,7 +136,7 @@ "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' 
- # https://github.com/intel/torch-xpu-ops/issues/1549 + # https://github.com/intel/torch-xpu-ops/issues/1549, 2.8 skipped "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", @@ -164,14 +144,14 @@ "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. - # https://github.com/intel/torch-xpu-ops/issues/1550 + # https://github.com/intel/torch-xpu-ops/issues/1550, 2.8 skipped "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' # is not currently implemented for the XPU device. - # https://github.com/intel/torch-xpu-ops/issues/1551 + # https://github.com/intel/torch-xpu-ops/issues/1551, 2.8 skipped "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", @@ -183,8 +163,8 @@ ), "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! - # https://github.com/intel/torch-xpu-ops/issues/1555 - "test/distributed/tensor/parallel/test_tp_examples.py::DistTensorParallelExampleTest::test_transformer_req_grad_seq_parallel_float32_thaw_all", + # https://github.com/intel/torch-xpu-ops/issues/1555, 2.8 skipped + "test_transformer_req_grad_seq_parallel_float32_thaw_all", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", @@ -192,13 +172,36 @@ "test_transformer_training_is_seq_parallel_False_float32", "test_transformer_training_is_seq_parallel_True_float32", # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. 
- # https://github.com/intel/torch-xpu-ops/issues/1556 + # https://github.com/intel/torch-xpu-ops/issues/1556, 2.8 skipped "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_loss_parallel", + "test_mlp_training_is_seq_parallel_False_recompute_activation_False", + "test_mlp_training_is_seq_parallel_True_recompute_activation_False", + "test_transformer_req_grad_float64_thaw_all", + "test_transformer_training_is_seq_parallel_False_float64", + "test_transformer_training_is_seq_parallel_True_float64", + "test_sequence_parallel_style", ), "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, - "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_linear_col_wise_parallel", + "test_parallelize_mlp_with_module_api", + "test_parallelize_mlp_with_module_api_nested", + "test_parallelize_module_multi_wildcard", + "test_parallelize_module_src_data_rank", + "test_parallelize_module_with_digit", + "test_parallelize_module_with_question", + "test_parallelize_module_with_star", + "test_under_devicemesh_context", + ), "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, - "../../../../test/distributed/tensor/test_api.py": None, + "../../../../test/distributed/tensor/test_api.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_distribute_tensor_rank", + "test_distribute_tensor_uneven_sharding", + ), "../../../../test/distributed/tensor/test_attention.py": None, "../../../../test/distributed/tensor/test_common_rules.py": None, "../../../../test/distributed/tensor/test_dtensor.py": None, @@ -206,21 +209,137 @@ "../../../../test/distributed/tensor/test_experimental_ops.py": None, "../../../../test/distributed/tensor/test_init.py": None, "../../../../test/distributed/tensor/test_math_ops.py": ( - # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path - # https://github.com/intel/torch-xpu-ops/issues/1508 - "test_mean", - "test_nll_loss_and_cross_entropy", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_cumsum", + "test_layer_norm_bwd", + "test_layer_norm_bwd_req_grad", + "test_layer_norm_fwd", + "test_linear_op_reductions", + "test_shard0_svd", + "test_softmax_fwd", + "test_topk", ), "../../../../test/distributed/tensor/test_random_ops.py": None, - "../../../../test/distributed/tensor/test_redistribute.py": None, - "../../../../test/distributed/tensor/test_tensor_ops.py": None, - "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, + "../../../../test/distributed/tensor/test_redistribute.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_redistribute_shard_dim_change", + "test_redistribute_uneven_sharding", + "test_shard_to_replicate_forward_backward", + "test_shard_to_replicate_forward_backward_datatype_conversion", + "test_multi_dim_mesh", + ), + "../../../../test/distributed/tensor/test_tensor_ops.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_aten_contiguous", + "test_gather", + "test_index", + "test_slice", + "test_stack", + "test_where_type_promotion", + ), + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_argmax", + "test_softmax_fwd", + ), + # 
FSDP2 + "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_nontensor_activations", + "test_unused_forward_module", + "test_unused_forward_output", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( + # ValueError: Cannot use ReduceOp.PREMUL_SUM with XCCL + # https://github.com/intel/torch-xpu-ops/issues/1571, 2.8 skipped + "test_set_reduce_scatter_divide_factor", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": ( + # torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised + # https://github.com/intel/torch-xpu-ops/issues/1665, 2.8 skipped + "test_transformer_backend_inductor_fullgraph_True", + "test_nested_fully_shard_backend_inductor_fullgraph_True", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_train_mixed_requires_grad_per_group", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_compute_dtype", + "test_grad_acc_with_reduce_dtype", + "test_reduce_dtype", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": ( + # Performance test, should skip + "test_fully_shard_training_overlap", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( + # checkpointing issue, 2.8 skipped + "test_train_parity_with_activation_checkpointing", + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_1f1b_microbatching", + "test_gradient_accumulation", + ), + "../../../../test/distributed/_composable/test_replicate_with_compiler.py": ( + # AssertionError: Tensor-likes are not close! 
+ # https://github.com/intel/torch-xpu-ops/issues/1668, 2.8 skipped + "test_compile_backward_only", + "test_compile_bf16", + "test_compile_fp16", + "test_compile_gpu", + "test_compile_gpu_ac", + ), + "../../../../test/distributed/_shard/test_sharder.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_custom_sharder", + ), + "../../../../test/distributed/_shard/sharded_tensor/test_logger.py": None, + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor.py": { + # RuntimeError: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) + # https://github.com/intel/torch-xpu-ops/issues/1617, 2.8 skipped + "test_complete_world_size", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_grid_sharding", + "test_multiple_local_shards", + "test_new_group", + "test_partial_world_size", + "test_with_rpc_names", + "test_init_from_local_tensor", + # what(): Attempting to send a Tensor with unexpected device type xpu:3 + # https://github.com/intel/torch-xpu-ops/issues/1616, 2.8 skipped + "test_init_from_local_shards", + "test_init_from_local_shards_and_global_metadata", + }, + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py": None, + "../../../../test/distributed/_shard/sharding_plan/test_sharding_plan.py": None, + "../../../../test/distributed/_shard/sharding_spec/test_sharding_spec.py": None, + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None, + # ( + # # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # # https://github.com/intel/torch-xpu-ops/issues/1508, 2.8 skipped + # "test_tracker_with_activation_checkpointing", + # ), + "../../../../test/distributed/_tools/test_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_memory_tracker.py": None, + "../../../../test/distributed/_tools/test_mod_tracker.py": None, } skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, - "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + # "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. 
"../../../../test/distributed/pipelining/test_stage.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, -} +} \ No newline at end of file From 6e54fb84daa243244f3f3554d3eb4f1615d0a3c5 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:25:50 +0000 Subject: [PATCH 62/83] add distributed ut in CI --- .github/scripts/ut_result_check.sh | 7 +- .github/workflows/_linux_ut.yml | 140 +++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 4 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 260672ce0..cea234d31 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -134,10 +134,9 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi - if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then - grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log - grep -E "have failures" xpu_distributed_test.log | awk '{print $1}' >> ./"${ut_suite}"_test_failed.log + grep -E "^FAILED" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + grep -E "have failures" "${ut_suite}"_test.log | awk '{print $1}' >> ./"${ut_suite}"_test_failed.log compare_and_filter_logs "${ut_suite}"_test_failed.log Known_issue.log if [[ -f "${ut_suite}_test_failed_filtered.log" ]]; then num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_test_failed_filtered.log") @@ -155,4 +154,4 @@ if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distribute else echo -e "[PASS] UT ${ut_suite} test Pass" fi -fi +fi \ No newline at end of file diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 12cf823f5..8f13b267a 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -601,3 +601,143 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log From 5659efd4c2654a7abf24f86e6c9795c44a160d6b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sat, 5 Apr 2025 19:52:17 +0000 Subject: [PATCH 63/83] update if condition --- .github/workflows/_linux_build.yml | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 69a843e35..d8c7eb2c5 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -86,20 +86,22 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - fi - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi >>>>>>> 62e9ff75 (add distributed ut in CI) - name: Build Pytorch XPU From 596f231d0117a60c4600002935b4cd57bfb15db1 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 13:37:10 +0000 Subject: [PATCH 64/83] update pytorch build --- .github/workflows/_linux_build.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index d8c7eb2c5..69a843e35 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -86,22 +86,20 @@ jobs: pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - git submodule sync && git submodule update --init --recursive else git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" 
- else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi + fi + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi >>>>>>> 62e9ff75 (add distributed ut in CI) - name: Build Pytorch XPU From 2b958e1d944f0b1d4c75d23eb767a284304448fc Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 14:55:26 +0000 Subject: [PATCH 65/83] update if condition --- .github/workflows/_linux_ut.yml | 2 +- .github/workflows/pull.yml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 8f13b267a..592ba338f 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' || inputs.ut != 'pytorch_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' && inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: GH_TOKEN: ${{ github.token }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ee0caaac8..24f9e2b1e 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -78,7 +78,6 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 - keep_torch_xpu_ops: true runner: pvc_e2e preci-ut-distributed: From 5d9a340656f4ecbbb09ac4c92c44a0a078159d89 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 7 Apr 2025 18:12:34 +0000 Subject: [PATCH 66/83] resolve Artifact name conflict --- .github/workflows/_linux_build.yml | 4 ++-- .github/workflows/_linux_ut.yml | 15 +++++---------- .github/workflows/pull.yml | 4 ++-- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 69a843e35..630debe92 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -189,13 +189,13 @@ jobs: if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/torch*.whl - name: Upload Build Log if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/pytorch_*.log - name: Cleanup if: always() diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 592ba338f..74e1b92a4 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -97,7 +97,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -381,7 +381,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -515,18 +515,13 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | source activate xpu_op_${ZE_AFFINITY_MASK} source .github/scripts/env.sh ${{ inputs.pytorch }} pip install mkl-static==2025.0.1 mkl-include==2025.0.1 - if [[ ${{ inputs.abi }} == '0' ]]; then - export _GLIBCXX_USE_CXX11_ABI=0 - else - export _GLIBCXX_USE_CXX11_ABI=1 - fi if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then cd ../pytorch export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} @@ -575,7 +570,7 @@ jobs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi - timeout 10000 python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: UT Test Results Check @@ -599,7 +594,7 @@ jobs: if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed path: ${{ github.workspace }}/ut_log pytorch_distributed_test: diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 24f9e2b1e..472a48927 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -88,7 +88,7 @@ jobs: needs: preci-linux-build-distributed uses: ./.github/workflows/_linux_ut.yml with: - pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} + pytorch: ${{ needs.preci-linux-build-distributed.outputs.torch_commit_id }} ut: pytorch_distributed runner: pvc_e2e @@ -137,7 +137,7 @@ jobs: - name: Download Pytorch wheel uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ needs.preci-linux-build.outputs.torch_commit_id }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | From 3fd92a3344ad65bfb0901e61ff6027916d4ca31d Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Wed, 9 Apr 2025 23:57:58 -0700 Subject: [PATCH 67/83] add FSDP2 cases, improved check-ut.py for summary, do ZE_AFFINITY_MASK configuration before import torch --- .github/scripts/check-ut.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 5758c4e6d..a66b4b095 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -256,4 +256,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file From 137272cbdc9fbc251b3eb0444f55562abc3d18d0 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Fri, 18 Apr 2025 02:17:54 -0700 Subject: [PATCH 68/83] Skipped error cases Signed-off-by: Cheng, Penghui --- test/xpu/xpu_test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 7b2bac5e6..a191f349b 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1180,4 +1180,4 @@ def launch_test(test_case, skip_list=None, exe_list=None): f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + test_case ) - return os.system(test_command) + return os.system(test_command) \ No newline at end of file From 5cd5b16f24f2de3c555bd0bb90806db07f800abc Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:03:10 +0000 Subject: [PATCH 69/83] update ut --- .github/workflows/_linux_ut.yml | 140 -------------------------------- 1 file changed, 140 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 74e1b92a4..cd4b52c60 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -596,143 +596,3 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed path: ${{ github.workspace }}/ut_log - - pytorch_distributed_test: - runs-on: ${{ inputs.runner }} - if: contains(inputs.ut, 'pytorch_distributed') - timeout-minutes: 900 - env: - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Stock Pytorch - run: | - pwd - which 
conda && conda clean -ay - conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch - pip install requests - git clone https://github.com/daisyden/pytorch.git pytorch - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" - else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi - fi - - name: Triton Installation - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../pytorch - TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" - if [ -z ${{ inputs.triton }} ]; then - TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" - else - TRITON_COMMIT_ID="${{ inputs.triton }}" - fi - echo ${TRITON_REPO}@${TRITON_COMMIT_ID} - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" - fi - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 - with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} - path: ${{ github.workspace }} - - name: Install Pytorch XPU - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - source .github/scripts/env.sh ${{ inputs.pytorch }} - pip install mkl-static==2025.0.1 mkl-include==2025.0.1 - if [[ ${{ inputs.abi }} == '0' ]]; then - export _GLIBCXX_USE_CXX11_ABI=0 - else - export _GLIBCXX_USE_CXX11_ABI=1 - fi - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd ../pytorch - export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - pip install -r requirements.txt - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. - else - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ../pytorch - git reset --hard && git checkout ${TORCH_COMMIT_ID} - TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cd ${{ github.workspace }} - sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - - name: UT Test Results Check - shell: bash - run: | - function contains() { - contains_status="echo 'Start $2 ...'" - { - [[ $1 =~ (^|,)$2($|,) ]] - } || { - echo "[Warning] $2 is not suppotted type! Skipped!" 
- contains_status="continue" - } - } - set -xe - echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - cd ${{ github.workspace }}/ut_log/pytorch_distributed - cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ - bash ut_result_check.sh 'pytorch_distributed' - - name: Upload Inductor XPU UT Log - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed - path: ${{ github.workspace }}/ut_log From 0789c991c74a169c22fcc73ae92b4d612c100e1a Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Wed, 16 Apr 2025 02:30:33 -0700 Subject: [PATCH 70/83] add distributed ut summary --- .github/workflows/_linux_ut.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index cd4b52c60..3c90b13de 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -572,7 +572,14 @@ jobs: fi python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cd ${{ github.workspace }} + mkdir -p ut_log/pytorch_distributed_summary + cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: Distributed UT Test Results Summary + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + pip install junitparser + python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/pytorch_distributed_summary/* >> $GITHUB_STEP_SUMMARY || true - name: UT Test Results Check shell: bash run: | From 4f6bd8d87180124ea08beb145f5a33819854286f Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Wed, 16 Apr 2025 02:32:39 -0700 Subject: [PATCH 71/83] align the path --- .github/workflows/_linux_ut.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 3c90b13de..e0b075bd5 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -564,6 +564,7 @@ jobs: sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk sudo echo "0"|sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/pytorch_distributed + mkdir -p ut_log/pytorch_distributed_summary cd ../pytorch/third_party/torch-xpu-ops/test/xpu XCCL_EANBLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE}}" == 'False' ]]; then @@ -571,9 +572,8 @@ jobs: exit 1 fi python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cd ${{ github.workspace }} - mkdir -p ut_log/pytorch_distributed_summary cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary + cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: Distributed UT Test Results Summary run: | From bc65a513587ab35d289fd41c57d9fe3835a2b93e Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 17:50:43 +0000 Subject: [PATCH 72/83] update --- .github/workflows/_linux_ut.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index e0b075bd5..0e4b6521c 100644 
--- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -346,7 +346,7 @@ jobs: rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests git clone https://github.com/pytorch/pytorch pytorch if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then @@ -564,7 +564,6 @@ jobs: sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk sudo echo "0"|sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/pytorch_distributed - mkdir -p ut_log/pytorch_distributed_summary cd ../pytorch/third_party/torch-xpu-ops/test/xpu XCCL_EANBLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE}}" == 'False' ]]; then @@ -572,8 +571,9 @@ jobs: exit 1 fi python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary cd ${{ github.workspace }} + mkdir -p ut_log/pytorch_distributed_summary + cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: Distributed UT Test Results Summary run: | From a86dc57592c9d5c55a494de2154d834bea9c29f9 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 18:01:42 +0000 Subject: [PATCH 73/83] update --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 0e4b6521c..d25f71b04 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -62,7 +62,7 @@ jobs: rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests git clone https://github.com/pytorch/pytorch pytorch if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then From 0cdcce88a68a59850090ed876de50129f71cb04b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 16 Apr 2025 18:23:09 +0000 Subject: [PATCH 74/83] update --- .github/workflows/_linux_build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 630debe92..e942d990a 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -82,7 +82,7 @@ jobs: rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_build conda create -n xpu_build python=${{ inputs.python }} cmake=3.28 ninja -y source activate xpu_build - cd ../ && rm -rf pytorch + cd ../ && sudo rm -rf pytorch pip install requests if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then git clone https://github.com/daisyden/pytorch.git pytorch From e894f691964da3372fe2345db1fdb37ea2ad1a03 Mon Sep 17 00:00:00 2001 From: "Zhong, Ruijie" Date: Wed, 16 Apr 2025 18:18:08 -0700 Subject: [PATCH 75/83] align the path --- .github/workflows/_linux_ut.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index d25f71b04..ab1f741fe 
100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -564,16 +564,16 @@ jobs: sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk sudo echo "0"|sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/pytorch_distributed + mkdir -p ut_log/pytorch_distributed_summary cd ../pytorch/third_party/torch-xpu-ops/test/xpu XCCL_EANBLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE}}" == 'False' ]]; then echo -e "[ERROR] XCCL is not enabled" exit 1 fi - python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log - cd ${{ github.workspace }} - mkdir -p ut_log/pytorch_distributed_summary + python run_distributed_local.py 2>${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log cp op_ut_with_skip_* ${{ github.workspace }}/ut_log/pytorch_distributed_summary + cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: Distributed UT Test Results Summary run: | From a286091fdf34f8532e18f14a051961ddf595a652 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 16 May 2025 18:17:37 +0000 Subject: [PATCH 76/83] fix yml --- .github/workflows/_linux_build.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index e942d990a..1f520aa6d 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -101,7 +101,6 @@ jobs: # Workaround for torch-xpu-ops ci test sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi ->>>>>>> 62e9ff75 (add distributed ut in CI) - name: Build Pytorch XPU run: | set -xe From 97504b41087ecf2381db9a3303469510f4a43e92 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Tue, 20 May 2025 19:40:07 -0700 Subject: [PATCH 77/83] remove invalid case Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 05942858c..13dda8eba 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -181,7 +181,6 @@ "test_transformer_req_grad_float64_thaw_all", "test_transformer_training_is_seq_parallel_False_float64", "test_transformer_training_is_seq_parallel_True_float64", - "test_sequence_parallel_style", ), "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": ( From 5ca55a9f82d2e891adb21f90823f9e1af426f090 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Thu, 22 May 2025 02:39:33 -0700 Subject: [PATCH 78/83] Use python instead of pytest to run test_c10d_functional_native.py Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 13dda8eba..0e773ea1f 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -63,7 +63,6 @@ "../../../../test/distributed/fsdp/test_wrap.py": None, "../../../../test/distributed/test_backends.py": None, "../../../../test/distributed/test_c10d_common.py": None, - 
"../../../../test/distributed/test_c10d_functional_native.py": None, "../../../../test/distributed/test_c10d_logger.py": None, "../../../../test/distributed/test_c10d_object_collectives.py": ( # RuntimeError: Process 2 exited with error code 10 and exception: ; AssertionError: Scalars are not equal! @@ -338,6 +337,7 @@ skip_dict_python = { "distributed/test_c10d_ops_xccl.py": None, "distributed/test_c10d_xccl.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": None, # "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. "../../../../test/distributed/pipelining/test_stage.py": None, "../../../../test/distributed/pipelining/test_transformer.py": None, From 638767040e4e3263ec41b7334891c31ccd763990 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Thu, 22 May 2025 20:29:01 -0700 Subject: [PATCH 79/83] enable libfrabric WA --- test/xpu/run_distributed_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py index 94fbfae32..3008a8fc1 100644 --- a/test/xpu/run_distributed_local.py +++ b/test/xpu/run_distributed_local.py @@ -9,6 +9,9 @@ fail_test = [] error_log = "" +# libfabric WA to hang issue +os.environ["FI_PROVIDER"] = "tcp" + os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" # Get the xelink group card affinity ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") From 219de35bf50f27702fdca07e69b8e3ef28a6fc74 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Tue, 27 May 2025 00:02:04 -0700 Subject: [PATCH 80/83] Add accuracy issue to skip list Signed-off-by: Cheng, Penghui --- test/xpu/skip_list_dist_local.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index 0e773ea1f..c2d305c7e 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -72,7 +72,11 @@ ), "../../../../test/distributed/test_compute_comm_reordering.py": None, "../../../../test/distributed/test_control_collectives.py": None, - "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_device_mesh.py": { + # RuntimeError: Process 1 exited with error code 10 and exception: + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_scatter_1d", + }, "../../../../test/distributed/test_dynamo_distributed.py": ( # AssertionError: 'setattr() on Tensor.requires_grad' not found in 'Attempted to call function marked as skipped # https://github.com/intel/torch-xpu-ops/issues/1667, 2.8 skipped From 92b3cad5e4103c18ae2a70c52845e55c3ed7ebcf Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Sun, 1 Jun 2025 02:22:42 +0800 Subject: [PATCH 81/83] fix skip_list_dist_local.py typo --- test/xpu/skip_list_dist_local.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index c2d305c7e..b06449f00 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -72,11 +72,11 @@ ), "../../../../test/distributed/test_compute_comm_reordering.py": None, "../../../../test/distributed/test_control_collectives.py": None, - "../../../../test/distributed/test_device_mesh.py": { + "../../../../test/distributed/test_device_mesh.py": ( # RuntimeError: Process 1 exited with error code 10 and exception: # https://jira.devtools.intel.com/browse/MLSL-3625 
"test_scatter_1d", - }, + ), "../../../../test/distributed/test_dynamo_distributed.py": ( # AssertionError: 'setattr() on Tensor.requires_grad' not found in 'Attempted to call function marked as skipped # https://github.com/intel/torch-xpu-ops/issues/1667, 2.8 skipped @@ -306,7 +306,7 @@ "test_custom_sharder", ), "../../../../test/distributed/_shard/sharded_tensor/test_logger.py": None, - "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor.py": { + "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor.py": ( # RuntimeError: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259) # https://github.com/intel/torch-xpu-ops/issues/1617, 2.8 skipped "test_complete_world_size", @@ -323,7 +323,7 @@ # https://github.com/intel/torch-xpu-ops/issues/1616, 2.8 skipped "test_init_from_local_shards", "test_init_from_local_shards_and_global_metadata", - }, + ), "../../../../test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py": None, "../../../../test/distributed/_shard/sharding_plan/test_sharding_plan.py": None, "../../../../test/distributed/_shard/sharding_spec/test_sharding_spec.py": None, From fa10de24bda0b0b9900317d166f889888780dc29 Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Wed, 11 Jun 2025 17:53:26 +0800 Subject: [PATCH 82/83] improve ut_result_check.sh distributed part --- .github/scripts/ut_result_check.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 51fb13a12..07dad4c7b 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -134,8 +134,9 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then fi fi if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then - grep -E "^FAILED" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log - grep -E "have failures" "${ut_suite}"_test.log | awk '{print $1}' >> ./"${ut_suite}"_test_failed.log + grep -E "^FAILED" "${ut_suite}"_test.log | awk '{print $3}' > ./"${ut_suite}"_test_failed.log + # grep -E "have failures" "${ut_suite}"_test.log | awk '{print $1}' >> ./"${ut_suite}"_test_failed.log + sed -i '/^[^.]\+/d' ./"${ut_suite}"_test_failed.log compare_and_filter_logs "${ut_suite}"_test_failed.log Known_issue.log if [[ -f "${ut_suite}_test_failed_filtered.log" ]]; then num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_test_failed_filtered.log") @@ -153,4 +154,4 @@ if [[ "${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distribute else echo -e "[PASS] UT ${ut_suite} test Pass" fi -fi \ No newline at end of file +fi From 24a5faf39417f51d489dad5323223be9f59986f1 Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Sun, 29 Jun 2025 19:21:33 +0800 Subject: [PATCH 83/83] skip two new cases which do not have related environment variable --- test/xpu/skip_list_dist_local.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py index b06449f00..5b2460df1 100644 --- a/test/xpu/skip_list_dist_local.py +++ b/test/xpu/skip_list_dist_local.py @@ -256,6 +256,9 @@ # ValueError: Cannot use ReduceOp.PREMUL_SUM with XCCL # https://github.com/intel/torch-xpu-ops/issues/1571, 2.8 skipped "test_set_reduce_scatter_divide_factor", + # NO related environment variable on XPU + 
"test_fully_shard_force_sum_both_reductions", + "test_fully_shard_force_sum_reduce_scatter", ), "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": ( # torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised