Skip to content

Commit c1094a0

Browse files
authored
[SYCL] Fix use_root_sync handling (#17739)
Address issue #16451 , where property `use_root_sync` is not processed properly. Also revised `sycl/test-e2e/GroupAlgorithm/root_group.cpp` to not use the deprecated version of `parallel_for`. (Which was previously blocked by this issue about `use_root_sync`). Also here's some explanation for the change in `handler.hpp`: This is where the previous code doesn't handle `use_root_sync` correctly: `processLaunchProperties` will be called twice, first for the property list returned by the kernel functor's `get(properties_tag)` method, and then for `Props` that is passed in as a parameter to `parallel_for`. Therefore, if the `get(properties_tag)` method specifies `use_root_sync` and `Props` is empty or doesn't contain `use_root_sync`, what will be done is: - first, the property list returned by the kernel functor's `get(properties_tag)` method get processed. And since it contains `use_root_sync`, `setKernelIsCooperative(true)` is called; - then, the property list `Props` that is passed in as a parameter to `parallel_for` get processed. And since it doesn't contain `use_root_sync` (actually for the non-deprecated variants of `parallel_for`, `Props` should always be an empty property list), `setKernelIsCooperative(**false**)` is called And thus in the end the `MKernelIsCooperative` flag will be set to false, while it actually should be true. Revising the code like this solve the problem. Also `MKernelIsCooperative` is false by default, so we don't need to worry if `setKernelIsCooperative` is not called. --------- Signed-off-by: Hu, Peisen <[email protected]>
1 parent e69e779 commit c1094a0

File tree

2 files changed

+33
-25
lines changed

2 files changed

+33
-25
lines changed

sycl/include/sycl/handler.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -829,7 +829,9 @@ class __SYCL_EXPORT handler {
829829

830830
constexpr bool UsesRootSync = PropertiesT::template has_property<
831831
sycl::ext::oneapi::experimental::use_root_sync_key>();
832-
setKernelIsCooperative(UsesRootSync);
832+
if (UsesRootSync) {
833+
setKernelIsCooperative(UsesRootSync);
834+
}
833835
if constexpr (PropertiesT::template has_property<
834836
sycl::ext::oneapi::experimental::
835837
work_group_progress_key>()) {

sycl/test-e2e/GroupAlgorithm/root_group.cpp

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,7 @@
22
// XFAIL: (opencl && !cpu && !accelerator)
33
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14641
44

5-
// TODO: Currently using the -Wno-deprecated-declarations flag due to issue
6-
// https://github.com/intel/llvm/issues/16451. Rewrite testRootGroup() amd
7-
// remove the flag once the issue is resolved.
8-
// RUN: %{build} -I . -o %t.out -Wno-deprecated-declarations %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
5+
// RUN: %{build} -I . -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
96
// RUN: %{run} %t.out
107

118
// Disabled temporarily while investigation into the failure is ongoing.
@@ -63,6 +60,34 @@ void testQueriesAndProperties() {
6360
check_max_num_work_group_sync(maxWGsWithLimits);
6461
}
6562

63+
template <typename T> struct TestKernel1 {
64+
T m_data;
65+
TestKernel1(T &data_) : m_data(data_) {}
66+
void operator()(sycl::nd_item<1> it) const {
67+
volatile float X = 1.0f;
68+
volatile float Y = 1.0f;
69+
auto root = it.ext_oneapi_get_root_group();
70+
m_data[root.get_local_id()] = root.get_local_id();
71+
sycl::group_barrier(root);
72+
// Delay half of the workgroups with extra work to check that the barrier
73+
// synchronizes the whole device.
74+
if (it.get_group(0) % 2 == 0) {
75+
X += sycl::sin(X);
76+
Y += sycl::cos(Y);
77+
}
78+
root = sycl::ext::oneapi::experimental::this_work_item::get_root_group<1>();
79+
int sum = m_data[root.get_local_id()] +
80+
m_data[root.get_local_range() - root.get_local_id() - 1];
81+
sycl::group_barrier(root);
82+
m_data[root.get_local_id()] = sum;
83+
}
84+
auto get(sycl::ext::oneapi::experimental::properties_tag) const {
85+
return sycl::ext::oneapi::experimental::properties{
86+
sycl::ext::oneapi::experimental::use_root_sync};
87+
;
88+
}
89+
};
90+
6691
void testRootGroup() {
6792
sycl::queue q;
6893
const auto bundle =
@@ -79,26 +104,7 @@ void testRootGroup() {
79104
const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
80105
q.submit([&](sycl::handler &h) {
81106
sycl::accessor data{dataBuf, h};
82-
h.parallel_for<
83-
class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
84-
volatile float X = 1.0f;
85-
volatile float Y = 1.0f;
86-
auto root = it.ext_oneapi_get_root_group();
87-
data[root.get_local_id()] = root.get_local_id();
88-
sycl::group_barrier(root);
89-
// Delay half of the workgroups with extra work to check that the barrier
90-
// synchronizes the whole device.
91-
if (it.get_group(0) % 2 == 0) {
92-
X += sycl::sin(X);
93-
Y += sycl::cos(Y);
94-
}
95-
root =
96-
sycl::ext::oneapi::experimental::this_work_item::get_root_group<1>();
97-
int sum = data[root.get_local_id()] +
98-
data[root.get_local_range() - root.get_local_id() - 1];
99-
sycl::group_barrier(root);
100-
data[root.get_local_id()] = sum;
101-
});
107+
h.parallel_for<class RootGroupKernel>(range, TestKernel1(data));
102108
});
103109
sycl::host_accessor data{dataBuf};
104110
const int workItemCount = static_cast<int>(range.get_global_range().size());

0 commit comments

Comments
 (0)