[SYCL] Fix use_root_sync handling (#17739)

HPS-1 · web-flow · commit c1094a0575e2 · 2025-04-04T14:55:08.000+01:00
Addresses issue #16451, where the property `use_root_sync` is not processed properly. Also revises `sycl/test-e2e/GroupAlgorithm/root_group.cpp` to not use the deprecated version of `parallel_for` (which was previously blocked by this issue about `use_root_sync`). Here is some explanation for the change in `handler.hpp`. This is where the previous code doesn't handle `use_root_sync` correctly: `processLaunchProperties` will be called twice, first for the property list returned by the kernel functor's `get(properties_tag)` method, and then for `Props` that is passed in as a parameter to `parallel_for`. Therefore, if the `get(properties_tag)` method specifies `use_root_sync` and `Props` is empty or doesn't contain `use_root_sync`, what will be done is: - first, the property list returned by the kernel functor's `get(properties_tag)` method gets processed. Since it contains `use_root_sync`, `setKernelIsCooperative(true)` is called; - then, the property list `Props` that is passed in as a parameter to `parallel_for` gets processed. Since it doesn't contain `use_root_sync` (actually, for the non-deprecated variants of `parallel_for`, `Props` should always be an empty property list), `setKernelIsCooperative(**false**)` is called. Thus, in the end, the `MKernelIsCooperative` flag will be set to false, while it actually should be true. Revising the code like this solves the problem. Also, `MKernelIsCooperative` is false by default, so we don't need to worry if `setKernelIsCooperative` is not called. --------- Signed-off-by: Hu, Peisen <peisen.hu@intel.com>
diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
@@ -829,7 +829,9 @@ class __SYCL_EXPORT handler {
 
     constexpr bool UsesRootSync = PropertiesT::template has_property<
         sycl::ext::oneapi::experimental::use_root_sync_key>();
-    setKernelIsCooperative(UsesRootSync);
+    if (UsesRootSync) {
+      setKernelIsCooperative(UsesRootSync);
+    }
     if constexpr (PropertiesT::template has_property<
                       sycl::ext::oneapi::experimental::
                           work_group_progress_key>()) {
diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -2,10 +2,7 @@
 // XFAIL: (opencl && !cpu && !accelerator)
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/14641
 
-// TODO: Currently using the -Wno-deprecated-declarations flag due to issue
-// https://github.com/intel/llvm/issues/16451. Rewrite testRootGroup() amd
-// remove the flag once the issue is resolved.
-// RUN: %{build} -I . -o %t.out -Wno-deprecated-declarations %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
+// RUN: %{build} -I . -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
 // RUN: %{run} %t.out
 
 // Disabled temporarily while investigation into the failure is ongoing.
@@ -63,6 +60,34 @@ void testQueriesAndProperties() {
   check_max_num_work_group_sync(maxWGsWithLimits);
 }
 
+// Root-group test kernel functor; its get() opts in to use_root_sync.
+template <typename T> struct TestKernel1 {
+  T m_data; // indexed by root-group local id
+  TestKernel1(T &data_) : m_data(data_) {}
+  void operator()(sycl::nd_item<1> it) const {
+    volatile float X = 1.0f;
+    volatile float Y = 1.0f;
+    auto root = it.ext_oneapi_get_root_group();
+    m_data[root.get_local_id()] = root.get_local_id();
+    sycl::group_barrier(root);
+    // Delay half of the workgroups with extra work to check that the barrier
+    // synchronizes the whole device.
+    if (it.get_group(0) % 2 == 0) {
+      X += sycl::sin(X);
+      Y += sycl::cos(Y);
+    }
+    root = sycl::ext::oneapi::experimental::this_work_item::get_root_group<1>();
+    int sum = m_data[root.get_local_id()] +
+              m_data[root.get_local_range() - root.get_local_id() - 1];
+    sycl::group_barrier(root);
+    m_data[root.get_local_id()] = sum;
+  }
+  auto get(sycl::ext::oneapi::experimental::properties_tag) const {
+    return sycl::ext::oneapi::experimental::properties{
+        sycl::ext::oneapi::experimental::use_root_sync};
+  }
+};
+
 void testRootGroup() {
   sycl::queue q;
   const auto bundle =
@@ -79,26 +104,7 @@ void testRootGroup() {
   const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
   q.submit([&](sycl::handler &h) {
     sycl::accessor data{dataBuf, h};
-    h.parallel_for<
-        class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
-      volatile float X = 1.0f;
-      volatile float Y = 1.0f;
-      auto root = it.ext_oneapi_get_root_group();
-      data[root.get_local_id()] = root.get_local_id();
-      sycl::group_barrier(root);
-      // Delay half of the workgroups with extra work to check that the barrier
-      // synchronizes the whole device.
-      if (it.get_group(0) % 2 == 0) {
-        X += sycl::sin(X);
-        Y += sycl::cos(Y);
-      }
-      root =
-          sycl::ext::oneapi::experimental::this_work_item::get_root_group<1>();
-      int sum = data[root.get_local_id()] +
-                data[root.get_local_range() - root.get_local_id() - 1];
-      sycl::group_barrier(root);
-      data[root.get_local_id()] = sum;
-    });
+    h.parallel_for<class RootGroupKernel>(range, TestKernel1(data));
   });
   sycl::host_accessor data{dataBuf};
   const int workItemCount = static_cast<int>(range.get_global_range().size());