@@ -187,11 +187,12 @@ class type_erased_cgfo_ty {
187
187
188
188
public:
189
189
template <class T >
190
- type_erased_cgfo_ty (T &f)
191
- // NOTE: Even if `T ` is a pointer to a function, `&f` is a pointer to a
190
+ type_erased_cgfo_ty (T && f)
191
+ // NOTE: Even if `f` is a pointer to a function, `&f` is a pointer to a
192
192
// pointer to a function and as such can be cast to `void *` (pointer to
193
193
// a function cannot be cast).
194
- : object(static_cast <const void *>(&f)), invoker_f(&invoker<T>::call) {}
194
+ : object(static_cast <const void *>(&f)),
195
+ invoker_f (&invoker<std::remove_reference_t <T>>::call) {}
195
196
~type_erased_cgfo_ty () = default ;
196
197
197
198
type_erased_cgfo_ty (const type_erased_cgfo_ty &) = delete;
@@ -3878,14 +3879,6 @@ class HandlerAccess {
3878
3879
Handler.parallel_for_impl (Range, Props, Kernel);
3879
3880
}
3880
3881
3881
- template <typename T, typename > struct dependent {
3882
- using type = T;
3883
- };
3884
- template <typename T>
3885
- using dependent_queue_t = typename dependent<queue, T>::type;
3886
- template <typename T>
3887
- using dependent_handler_t = typename dependent<handler, T>::type;
3888
-
3889
3882
// pre/postProcess are used only for reductions right now, but the
3890
3883
// abstractions they provide aren't reduction-specific. The main problem they
3891
3884
// solve is
@@ -3901,71 +3894,16 @@ class HandlerAccess {
3901
3894
// inside the command group function object (lambda above) so we resort to a
3902
3895
// somewhat hacky way of creating multiple `handler`s and manual finalization
3903
3896
// of them (instead of the one in `queue::submit`).
3904
- //
3905
- // Overloads with `queue &q` are provided in case the caller has it created
3906
- // already to avoid unnecessary reference count increments associated with
3907
- // `handler::getQueue()`.
3908
- template <class FunctorTy >
3909
- static void preProcess (handler &CGH, dependent_queue_t <FunctorTy> &q,
3910
- FunctorTy Func) {
3911
- bool EventNeeded = !q.is_in_order ();
3912
- handler AuxHandler (getSyclObjImpl (q), EventNeeded);
3913
- AuxHandler.copyCodeLoc (CGH);
3914
- std::forward<FunctorTy>(Func)(AuxHandler);
3915
- auto E = AuxHandler.finalize ();
3916
- assert (!CGH.MIsFinalized &&
3917
- " Can't do pre-processing if the command has been enqueued already!" );
3918
- if (EventNeeded)
3919
- CGH.depends_on (E);
3920
- }
3897
+ __SYCL_EXPORT static void preProcess (handler &CGH, type_erased_cgfo_ty F);
3898
+ __SYCL_EXPORT static void postProcess (handler &CGH, type_erased_cgfo_ty F);
3899
+
3921
3900
template <class FunctorTy >
3922
- static void preProcess (dependent_handler_t <FunctorTy> &CGH,
3923
- FunctorTy &&Func) {
3924
- preProcess (CGH, CGH.getQueue (), std::forward<FunctorTy>(Func));
3901
+ static void preProcess (handler &CGH, FunctorTy &Func) {
3902
+ preProcess (CGH, type_erased_cgfo_ty{Func});
3925
3903
}
3926
3904
template <class FunctorTy >
3927
- static void postProcess (dependent_handler_t <FunctorTy> &CGH,
3928
- FunctorTy &&Func) {
3929
- // The "hacky" `handler`s manipulation mentioned above and implemented here
3930
- // is far from perfect. A better approach would be
3931
- //
3932
- // bool OrigNeedsEvent = CGH.needsEvent()
3933
- // assert(CGH.not_finalized/enqueued());
3934
- // if (!InOrderQueue)
3935
- // CGH.setNeedsEvent()
3936
- //
3937
- // handler PostProcessHandler(Queue, OrigNeedsEvent)
3938
- // auto E = CGH.finalize(); // enqueue original or current last
3939
- // // post-process
3940
- // if (!InOrder)
3941
- // PostProcessHandler.depends_on(E)
3942
- //
3943
- // swap_impls(CGH, PostProcessHandler)
3944
- // return; // queue::submit finalizes PostProcessHandler and returns its
3945
- // // event if necessary.
3946
- //
3947
- // Still hackier than "real" `queue::submit` but at least somewhat sane.
3948
- // That, however hasn't been tried yet and we have an even hackier approach
3949
- // copied from what's been done in an old reductions implementation before
3950
- // eventless submission work has started. Not sure how feasible the approach
3951
- // above is at this moment.
3952
-
3953
- // This `finalize` is wrong (at least logically) if
3954
- // `assert(!CGH.eventNeeded())`
3955
- auto E = CGH.finalize ();
3956
- dependent_queue_t <FunctorTy> Queue = CGH.getQueue ();
3957
- bool InOrder = Queue.is_in_order ();
3958
- // Cannot use `CGH.eventNeeded()` alone as there might be subsequent
3959
- // `postProcess` calls and we cannot address them properly similarly to the
3960
- // `finalize` issue described above. `swap_impls` suggested above might be
3961
- // able to handle this scenario naturally.
3962
- handler AuxHandler (getSyclObjImpl (Queue), CGH.eventNeeded () || !InOrder);
3963
- if (!InOrder)
3964
- AuxHandler.depends_on (E);
3965
- AuxHandler.copyCodeLoc (CGH);
3966
- std::forward<FunctorTy>(Func)(AuxHandler);
3967
- CGH.MLastEvent = AuxHandler.finalize ();
3968
- return ;
3905
+ static void postProcess (handler &CGH, FunctorTy &Func) {
3906
+ postProcess (CGH, type_erased_cgfo_ty{Func});
3969
3907
}
3970
3908
};
3971
3909
} // namespace detail
0 commit comments