Skip to content

Commit d3bc93a

Browse files
[SYCL] Fix reductions in preview after #18767 (#18898)
As a matter of fact, I like this even more, because `type_erased_cgfo_ty` is cheap to create/pass around, we have full access to `handler_impl` in `handler.cpp` (unlike `handler.hpp` where it's opaque) and circular includes/type completeness aren't an issue anymore.
1 parent c192ad9 commit d3bc93a

File tree

5 files changed

+84
-74
lines changed

5 files changed

+84
-74
lines changed

sycl/include/sycl/handler.hpp

Lines changed: 11 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -187,11 +187,12 @@ class type_erased_cgfo_ty {
187187

188188
public:
189189
template <class T>
190-
type_erased_cgfo_ty(T &f)
191-
// NOTE: Even if `T` is a pointer to a function, `&f` is a pointer to a
190+
type_erased_cgfo_ty(T &&f)
191+
// NOTE: Even if `f` is a pointer to a function, `&f` is a pointer to a
192192
// pointer to a function and as such can be casted to `void *` (pointer to
193193
// a function cannot be casted).
194-
: object(static_cast<const void *>(&f)), invoker_f(&invoker<T>::call) {}
194+
: object(static_cast<const void *>(&f)),
195+
invoker_f(&invoker<std::remove_reference_t<T>>::call) {}
195196
~type_erased_cgfo_ty() = default;
196197

197198
type_erased_cgfo_ty(const type_erased_cgfo_ty &) = delete;
@@ -3878,14 +3879,6 @@ class HandlerAccess {
38783879
Handler.parallel_for_impl(Range, Props, Kernel);
38793880
}
38803881

3881-
template <typename T, typename> struct dependent {
3882-
using type = T;
3883-
};
3884-
template <typename T>
3885-
using dependent_queue_t = typename dependent<queue, T>::type;
3886-
template <typename T>
3887-
using dependent_handler_t = typename dependent<handler, T>::type;
3888-
38893882
// pre/postProcess are used only for reductions right now, but the
38903883
// abstractions they provide aren't reduction-specific. The main problem they
38913884
// solve is
@@ -3901,71 +3894,16 @@ class HandlerAccess {
39013894
// inside control group function object (lambda above) so we resort to a
39023895
// somewhat hacky way of creating multiple `handler`s and manual finalization
39033896
// of them (instead of the one in `queue::submit`).
3904-
//
3905-
// Overloads with `queue &q` are provided in case the caller has it created
3906-
// already to avoid unnecessary reference count increments associated with
3907-
// `handler::getQueue()`.
3908-
template <class FunctorTy>
3909-
static void preProcess(handler &CGH, dependent_queue_t<FunctorTy> &q,
3910-
FunctorTy Func) {
3911-
bool EventNeeded = !q.is_in_order();
3912-
handler AuxHandler(getSyclObjImpl(q), EventNeeded);
3913-
AuxHandler.copyCodeLoc(CGH);
3914-
std::forward<FunctorTy>(Func)(AuxHandler);
3915-
auto E = AuxHandler.finalize();
3916-
assert(!CGH.MIsFinalized &&
3917-
"Can't do pre-processing if the command has been enqueued already!");
3918-
if (EventNeeded)
3919-
CGH.depends_on(E);
3920-
}
3897+
__SYCL_EXPORT static void preProcess(handler &CGH, type_erased_cgfo_ty F);
3898+
__SYCL_EXPORT static void postProcess(handler &CGH, type_erased_cgfo_ty F);
3899+
39213900
template <class FunctorTy>
3922-
static void preProcess(dependent_handler_t<FunctorTy> &CGH,
3923-
FunctorTy &&Func) {
3924-
preProcess(CGH, CGH.getQueue(), std::forward<FunctorTy>(Func));
3901+
static void preProcess(handler &CGH, FunctorTy &Func) {
3902+
preProcess(CGH, type_erased_cgfo_ty{Func});
39253903
}
39263904
template <class FunctorTy>
3927-
static void postProcess(dependent_handler_t<FunctorTy> &CGH,
3928-
FunctorTy &&Func) {
3929-
// The "hacky" `handler`s manipulation mentioned above and implemented here
3930-
// is far from perfect. A better approach would be
3931-
//
3932-
// bool OrigNeedsEvent = CGH.needsEvent()
3933-
// assert(CGH.not_finalized/enqueued());
3934-
// if (!InOrderQueue)
3935-
// CGH.setNeedsEvent()
3936-
//
3937-
// handler PostProcessHandler(Queue, OrigNeedsEvent)
3938-
// auto E = CGH.finalize(); // enqueue original or current last
3939-
// // post-process
3940-
// if (!InOrder)
3941-
// PostProcessHandler.depends_on(E)
3942-
//
3943-
// swap_impls(CGH, PostProcessHandler)
3944-
// return; // queue::submit finalizes PostProcessHandler and returns its
3945-
// // event if necessary.
3946-
//
3947-
// Still hackier than "real" `queue::submit` but at least somewhat sane.
3948-
// That, however hasn't been tried yet and we have an even hackier approach
3949-
// copied from what's been done in an old reductions implementation before
3950-
// eventless submission work has started. Not sure how feasible the approach
3951-
// above is at this moment.
3952-
3953-
// This `finalize` is wrong (at least logically) if
3954-
// `assert(!CGH.eventNeeded())`
3955-
auto E = CGH.finalize();
3956-
dependent_queue_t<FunctorTy> Queue = CGH.getQueue();
3957-
bool InOrder = Queue.is_in_order();
3958-
// Cannot use `CGH.eventNeeded()` alone as there might be subsequent
3959-
// `postProcess` calls and we cannot address them properly similarly to the
3960-
// `finalize` issue described above. `swap_impls` suggested above might be
3961-
// able to handle this scenario naturally.
3962-
handler AuxHandler(getSyclObjImpl(Queue), CGH.eventNeeded() || !InOrder);
3963-
if (!InOrder)
3964-
AuxHandler.depends_on(E);
3965-
AuxHandler.copyCodeLoc(CGH);
3966-
std::forward<FunctorTy>(Func)(AuxHandler);
3967-
CGH.MLastEvent = AuxHandler.finalize();
3968-
return;
3905+
static void postProcess(handler &CGH, FunctorTy &Func) {
3906+
postProcess(CGH, type_erased_cgfo_ty{Func});
39693907
}
39703908
};
39713909
} // namespace detail

sycl/include/sycl/reduction.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1066,7 +1066,7 @@ class reduction_impl_algo {
10661066
std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
10671067
CGH.addReduction(Counter);
10681068

1069-
HandlerAccess::preProcess(CGH, q,
1069+
HandlerAccess::preProcess(CGH,
10701070
[Counter = Counter.get()](handler &AuxHandler) {
10711071
AuxHandler.memset(Counter, 0, sizeof(int));
10721072
});

sycl/source/handler.cpp

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2403,5 +2403,73 @@ void handler::copyCodeLoc(const handler &other) {
24032403
queue handler::getQueue() {
24042404
return createSyclObjFromImpl<queue>(impl->get_queue());
24052405
}
2406+
namespace detail {
2407+
__SYCL_EXPORT void HandlerAccess::preProcess(handler &CGH,
2408+
type_erased_cgfo_ty F) {
2409+
queue_impl &Q = CGH.impl->get_queue();
2410+
bool EventNeeded = !Q.isInOrder();
2411+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
2412+
handler_impl HandlerImpl{Q, nullptr, EventNeeded};
2413+
handler AuxHandler{HandlerImpl};
2414+
#else
2415+
handler AuxHandler{Q.shared_from_this(), EventNeeded};
2416+
#endif
2417+
AuxHandler.copyCodeLoc(CGH);
2418+
F(AuxHandler);
2419+
auto E = AuxHandler.finalize();
2420+
assert(!CGH.MIsFinalized &&
2421+
"Can't do pre-processing if the command has been enqueued already!");
2422+
if (EventNeeded)
2423+
CGH.depends_on(E);
2424+
}
2425+
__SYCL_EXPORT void HandlerAccess::postProcess(handler &CGH,
2426+
type_erased_cgfo_ty F) {
2427+
// The "hacky" `handler`s manipulation mentioned near the declaration in
2428+
// `handler.hpp` and implemented here is far from perfect. A better approach
2429+
// would be
2430+
//
2431+
// bool OrigNeedsEvent = CGH.needsEvent()
2432+
// assert(CGH.not_finalized/enqueued());
2433+
// if (!InOrderQueue)
2434+
// CGH.setNeedsEvent()
2435+
//
2436+
// handler PostProcessHandler(Queue, OrigNeedsEvent)
2437+
// auto E = CGH.finalize(); // enqueue original or current last
2438+
// // post-process
2439+
// if (!InOrder)
2440+
// PostProcessHandler.depends_on(E)
2441+
//
2442+
// swap_impls(CGH, PostProcessHandler)
2443+
// return; // queue::submit finalizes PostProcessHandler and returns its
2444+
// // event if necessary.
2445+
//
2446+
// Still hackier than "real" `queue::submit` but at least somewhat sane.
2447+
// That, however hasn't been tried yet and we have an even hackier approach
2448+
// copied from what's been done in an old reductions implementation before
2449+
// eventless submission work has started. Not sure how feasible the approach
2450+
// above is at this moment.
2451+
2452+
// This `finalize` is wrong (at least logically) if
2453+
// `assert(!CGH.eventNeeded())`
2454+
auto E = CGH.finalize();
2455+
queue_impl &Q = CGH.impl->get_queue();
2456+
bool InOrder = Q.isInOrder();
2457+
// Cannot use `CGH.eventNeeded()` alone as there might be subsequent
2458+
// `postProcess` calls and we cannot address them properly similarly to the
2459+
// `finalize` issue described above. `swap_impls` suggested above might be
2460+
// able to handle this scenario naturally.
2461+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
2462+
handler_impl HandlerImpl{Q, nullptr, CGH.eventNeeded() || !InOrder};
2463+
handler AuxHandler{HandlerImpl};
2464+
#else
2465+
handler AuxHandler{Q.shared_from_this(), CGH.eventNeeded() || !InOrder};
2466+
#endif
2467+
if (!InOrder)
2468+
AuxHandler.depends_on(E);
2469+
AuxHandler.copyCodeLoc(CGH);
2470+
F(AuxHandler);
2471+
CGH.MLastEvent = AuxHandler.finalize();
2472+
}
2473+
} // namespace detail
24062474
} // namespace _V1
24072475
} // namespace sycl

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3260,6 +3260,8 @@ _ZN4sycl3_V16detail12buffer_plainC2EmmRKNS0_13property_listESt10unique_ptrINS1_1
32603260
_ZN4sycl3_V16detail12compile_implERKNS0_13kernel_bundleILNS0_12bundle_stateE0EEERKSt6vectorINS0_6deviceESaIS8_EERKNS0_13property_listE
32613261
_ZN4sycl3_V16detail12isOutOfRangeENS0_3vecIiLi4EEENS0_15addressing_modeENS0_5rangeILi3EEE
32623262
_ZN4sycl3_V16detail12make_contextEmRKSt8functionIFvNS0_14exception_listEEENS0_7backendEbRKSt6vectorINS0_6deviceESaISA_EE
3263+
_ZN4sycl3_V16detail13HandlerAccess10preProcessERNS0_7handlerENS1_19type_erased_cgfo_tyE
3264+
_ZN4sycl3_V16detail13HandlerAccess11postProcessERNS0_7handlerENS1_19type_erased_cgfo_tyE
32633265
_ZN4sycl3_V16detail13host_pipe_map3addEPKvPKc
32643266
_ZN4sycl3_V16detail13lgamma_r_implENS1_9half_impl4halfEPi
32653267
_ZN4sycl3_V16detail13lgamma_r_implEdPi

sycl/test/abi/sycl_symbols_windows.dump

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4332,6 +4332,8 @@
43324332
?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_KAEBUimage_descriptor@12345@AEBVqueue@45@@Z
43334333
?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_K_K1IAEBVdevice@45@AEBVcontext@45@@Z
43344334
?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_K_K1IAEBVqueue@45@@Z
4335+
?postProcess@HandlerAccess@detail@_V1@sycl@@SAXAEAVhandler@34@Vtype_erased_cgfo_ty@234@@Z
4336+
?preProcess@HandlerAccess@detail@_V1@sycl@@SAXAEAVhandler@34@Vtype_erased_cgfo_ty@234@@Z
43354337
?prefetch@handler@_V1@sycl@@QEAAXPEBX_K@Z
43364338
?prefetch@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KAEBUcode_location@detail@23@@Z
43374339
?prefetch@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z

0 commit comments

Comments
 (0)