@@ -140,17 +140,10 @@ template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
140
140
return sycl::detail::make_tuple (Elements...);
141
141
}
142
142
143
- #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
144
- __SYCL_EXPORT size_t reduGetMaxWGSize (const std::shared_ptr<queue_impl> &Queue,
143
+ __SYCL_EXPORT size_t reduGetMaxWGSize (handler &cgh,
145
144
size_t LocalMemBytesPerWorkItem);
146
- __SYCL_EXPORT size_t reduGetPreferredWGSize (
147
- const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem);
148
- #else
149
- __SYCL_EXPORT size_t reduGetMaxWGSize (std::shared_ptr<queue_impl> Queue,
150
- size_t LocalMemBytesPerWorkItem);
151
- __SYCL_EXPORT size_t reduGetPreferredWGSize (std::shared_ptr<queue_impl> &Queue,
145
+ __SYCL_EXPORT size_t reduGetPreferredWGSize (handler &cgh,
152
146
size_t LocalMemBytesPerWorkItem);
153
- #endif
154
147
__SYCL_EXPORT size_t reduComputeWGSize (size_t NWorkItems, size_t MaxWGSize,
155
148
size_t &NWorkGroups);
156
149
@@ -1224,15 +1217,12 @@ template <>
1224
1217
struct NDRangeReduction <reduction::strategy::local_atomic_and_atomic_cross_wg> {
1225
1218
template <typename KernelName, int Dims, typename PropertiesT,
1226
1219
typename KernelType, typename Reduction>
1227
- static void run (handler &CGH,
1228
- const std::shared_ptr<detail::queue_impl> &Queue,
1229
- nd_range<Dims> NDRange, PropertiesT &Properties,
1220
+ static void run (handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
1230
1221
Reduction &Redu, KernelType &KernelFunc) {
1231
1222
static_assert (Reduction::has_identity,
1232
1223
" Identityless reductions are not supported by the "
1233
1224
" local_atomic_and_atomic_cross_wg strategy." );
1234
1225
1235
- std::ignore = Queue;
1236
1226
using Name = __sycl_reduction_kernel<
1237
1227
reduction::MainKrn, KernelName,
1238
1228
reduction::strategy::local_atomic_and_atomic_cross_wg>;
@@ -1276,15 +1266,12 @@ struct NDRangeReduction<
1276
1266
reduction::strategy::group_reduce_and_last_wg_detection> {
1277
1267
template <typename KernelName, int Dims, typename PropertiesT,
1278
1268
typename KernelType, typename Reduction>
1279
- static void run (handler &CGH,
1280
- const std::shared_ptr<detail::queue_impl> &Queue,
1281
- nd_range<Dims> NDRange, PropertiesT &Properties,
1269
+ static void run (handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
1282
1270
Reduction &Redu, KernelType &KernelFunc) {
1283
1271
static_assert (Reduction::has_identity,
1284
1272
" Identityless reductions are not supported by the "
1285
1273
" group_reduce_and_last_wg_detection strategy." );
1286
1274
1287
- std::ignore = Queue;
1288
1275
size_t NElements = Reduction::num_elements;
1289
1276
size_t WGSize = NDRange.get_local_range ().size ();
1290
1277
size_t NWorkGroups = NDRange.get_group_range ().size ();
@@ -1476,9 +1463,7 @@ void doTreeReductionOnTuple(size_t WorkSize, size_t LID,
1476
1463
template <> struct NDRangeReduction <reduction::strategy::range_basic> {
1477
1464
template <typename KernelName, int Dims, typename PropertiesT,
1478
1465
typename KernelType, typename Reduction>
1479
- static void run (handler &CGH,
1480
- const std::shared_ptr<detail::queue_impl> &Queue,
1481
- nd_range<Dims> NDRange, PropertiesT &Properties,
1466
+ static void run (handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
1482
1467
Reduction &Redu, KernelType &KernelFunc) {
1483
1468
using reducer_type = typename Reduction::reducer_type;
1484
1469
using element_type = typename ReducerTraits<reducer_type>::element_type;
@@ -1490,7 +1475,6 @@ template <> struct NDRangeReduction<reduction::strategy::range_basic> {
1490
1475
constexpr bool UsePartialSumForOutput =
1491
1476
!Reduction::is_usm && Reduction::has_identity;
1492
1477
1493
- std::ignore = Queue;
1494
1478
size_t NElements = Reduction::num_elements;
1495
1479
size_t WGSize = NDRange.get_local_range ().size ();
1496
1480
size_t NWorkGroups = NDRange.get_group_range ().size ();
@@ -1588,15 +1572,12 @@ template <>
1588
1572
struct NDRangeReduction <reduction::strategy::group_reduce_and_atomic_cross_wg> {
1589
1573
template <typename KernelName, int Dims, typename PropertiesT,
1590
1574
typename KernelType, typename Reduction>
1591
- static void run (handler &CGH,
1592
- const std::shared_ptr<detail::queue_impl> &Queue,
1593
- nd_range<Dims> NDRange, PropertiesT &Properties,
1575
+ static void run (handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
1594
1576
Reduction &Redu, KernelType &KernelFunc) {
1595
1577
static_assert (Reduction::has_identity,
1596
1578
" Identityless reductions are not supported by the "
1597
1579
" group_reduce_and_atomic_cross_wg strategy." );
1598
1580
1599
- std::ignore = Queue;
1600
1581
using Name = __sycl_reduction_kernel<
1601
1582
reduction::MainKrn, KernelName,
1602
1583
reduction::strategy::group_reduce_and_atomic_cross_wg>;
@@ -1625,14 +1606,11 @@ struct NDRangeReduction<
1625
1606
reduction::strategy::local_mem_tree_and_atomic_cross_wg> {
1626
1607
template <typename KernelName, int Dims, typename PropertiesT,
1627
1608
typename KernelType, typename Reduction>
1628
- static void run (handler &CGH,
1629
- const std::shared_ptr<detail::queue_impl> &Queue,
1630
- nd_range<Dims> NDRange, PropertiesT &Properties,
1609
+ static void run (handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
1631
1610
Reduction &Redu, KernelType &KernelFunc) {
1632
1611
using reducer_type = typename Reduction::reducer_type;
1633
1612
using element_type = typename ReducerTraits<reducer_type>::element_type;
1634
1613
1635
- std::ignore = Queue;
1636
1614
using Name = __sycl_reduction_kernel<
1637
1615
reduction::MainKrn, KernelName,
1638
1616
reduction::strategy::local_mem_tree_and_atomic_cross_wg>;
@@ -1687,9 +1665,7 @@ struct NDRangeReduction<
1687
1665
reduction::strategy::group_reduce_and_multiple_kernels> {
1688
1666
template <typename KernelName, int Dims, typename PropertiesT,
1689
1667
typename KernelType, typename Reduction>
1690
- static void run (handler &CGH,
1691
- const std::shared_ptr<detail::queue_impl> &Queue,
1692
- nd_range<Dims> NDRange, PropertiesT &Properties,
1668
+ static void run (handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
1693
1669
Reduction &Redu, KernelType &KernelFunc) {
1694
1670
static_assert (Reduction::has_identity,
1695
1671
" Identityless reductions are not supported by the "
@@ -1708,7 +1684,7 @@ struct NDRangeReduction<
1708
1684
// TODO: currently the maximal work group size is determined for the given
1709
1685
// queue/device, while it may be safer to use queries to the kernel compiled
1710
1686
// for the device.
1711
- size_t MaxWGSize = reduGetMaxWGSize (Queue , OneElemSize);
1687
+ size_t MaxWGSize = reduGetMaxWGSize (CGH , OneElemSize);
1712
1688
if (NDRange.get_local_range ().size () > MaxWGSize)
1713
1689
throw sycl::exception (make_error_code (errc::nd_range),
1714
1690
" The implementation handling parallel_for with"
@@ -1826,9 +1802,7 @@ struct NDRangeReduction<
1826
1802
template <> struct NDRangeReduction <reduction::strategy::basic> {
1827
1803
template <typename KernelName, int Dims, typename PropertiesT,
1828
1804
typename KernelType, typename Reduction>
1829
- static void run (handler &CGH,
1830
- const std::shared_ptr<detail::queue_impl> &Queue,
1831
- nd_range<Dims> NDRange, PropertiesT &Properties,
1805
+ static void run (handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
1832
1806
Reduction &Redu, KernelType &KernelFunc) {
1833
1807
using element_type = typename Reduction::reducer_element_type;
1834
1808
@@ -1837,7 +1811,7 @@ template <> struct NDRangeReduction<reduction::strategy::basic> {
1837
1811
// TODO: currently the maximal work group size is determined for the given
1838
1812
// queue/device, while it may be safer to use queries to the kernel
1839
1813
// compiled for the device.
1840
- size_t MaxWGSize = reduGetMaxWGSize (Queue , OneElemSize);
1814
+ size_t MaxWGSize = reduGetMaxWGSize (CGH , OneElemSize);
1841
1815
if (NDRange.get_local_range ().size () > MaxWGSize)
1842
1816
throw sycl::exception (make_error_code (errc::nd_range),
1843
1817
" The implementation handling parallel_for with"
@@ -2602,9 +2576,8 @@ tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
2602
2576
template <> struct NDRangeReduction <reduction::strategy::multi> {
2603
2577
template <typename KernelName, int Dims, typename PropertiesT,
2604
2578
typename ... RestT>
2605
- static void
2606
- run (handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
2607
- nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
2579
+ static void run (handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
2580
+ RestT... Rest) {
2608
2581
std::tuple<RestT...> ArgsTuple (Rest...);
2609
2582
constexpr size_t NumArgs = sizeof ...(RestT);
2610
2583
auto KernelFunc = std::get<NumArgs - 1 >(ArgsTuple);
@@ -2615,7 +2588,7 @@ template <> struct NDRangeReduction<reduction::strategy::multi> {
2615
2588
// TODO: currently the maximal work group size is determined for the given
2616
2589
// queue/device, while it is safer to use queries to the kernel compiled
2617
2590
// for the device.
2618
- size_t MaxWGSize = reduGetMaxWGSize (Queue , LocalMemPerWorkItem);
2591
+ size_t MaxWGSize = reduGetMaxWGSize (CGH , LocalMemPerWorkItem);
2619
2592
if (NDRange.get_local_range ().size () > MaxWGSize)
2620
2593
throw sycl::exception (make_error_code (errc::nd_range),
2621
2594
" The implementation handling parallel_for with"
@@ -2646,13 +2619,10 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {
2646
2619
2647
2620
template <typename KernelName, int Dims, typename PropertiesT,
2648
2621
typename KernelType, typename Reduction>
2649
- static void run (handler &CGH,
2650
- const std::shared_ptr<detail::queue_impl> &Queue,
2651
- nd_range<Dims> NDRange, PropertiesT &Properties,
2622
+ static void run (handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
2652
2623
Reduction &Redu, KernelType &KernelFunc) {
2653
2624
auto Delegate = [&](auto Impl) {
2654
- Impl.template run <KernelName>(CGH, Queue, NDRange, Properties, Redu,
2655
- KernelFunc);
2625
+ Impl.template run <KernelName>(CGH, NDRange, Properties, Redu, KernelFunc);
2656
2626
};
2657
2627
2658
2628
if constexpr (Reduction::has_float64_atomics) {
@@ -2694,10 +2664,9 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {
2694
2664
}
2695
2665
template <typename KernelName, int Dims, typename PropertiesT,
2696
2666
typename ... RestT>
2697
- static void
2698
- run (handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
2699
- nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
2700
- return Impl<Strat::multi>::run<KernelName>(CGH, Queue, NDRange, Properties,
2667
+ static void run (handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
2668
+ RestT... Rest) {
2669
+ return Impl<Strat::multi>::run<KernelName>(CGH, NDRange, Properties,
2701
2670
Rest...);
2702
2671
}
2703
2672
};
@@ -2706,12 +2675,11 @@ template <typename KernelName, reduction::strategy Strategy, int Dims,
2706
2675
typename PropertiesT, typename ... RestT>
2707
2676
void reduction_parallel_for (handler &CGH, nd_range<Dims> NDRange,
2708
2677
PropertiesT Properties, RestT... Rest) {
2709
- NDRangeReduction<Strategy>::template run<KernelName>(CGH, CGH. MQueue , NDRange ,
2710
- Properties, Rest...);
2678
+ NDRangeReduction<Strategy>::template run<KernelName>(CGH, NDRange, Properties ,
2679
+ Rest...);
2711
2680
}
2712
2681
2713
- __SYCL_EXPORT uint32_t
2714
- reduGetMaxNumConcurrentWorkGroups (std::shared_ptr<queue_impl> Queue);
2682
+ __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups (handler &cgh);
2715
2683
2716
2684
template <typename KernelName, reduction::strategy Strategy, int Dims,
2717
2685
typename PropertiesT, typename ... RestT>
@@ -2742,13 +2710,13 @@ void reduction_parallel_for(handler &CGH, range<Dims> Range,
2742
2710
#ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
2743
2711
__SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
2744
2712
#else
2745
- reduGetMaxNumConcurrentWorkGroups (CGH. MQueue );
2713
+ reduGetMaxNumConcurrentWorkGroups (CGH);
2746
2714
#endif
2747
2715
2748
2716
// TODO: currently the preferred work group size is determined for the given
2749
2717
// queue/device, while it is safer to use queries to the kernel pre-compiled
2750
2718
// for the device.
2751
- size_t PrefWGSize = reduGetPreferredWGSize (CGH. MQueue , OneElemSize);
2719
+ size_t PrefWGSize = reduGetPreferredWGSize (CGH, OneElemSize);
2752
2720
2753
2721
size_t NWorkItems = Range.size ();
2754
2722
size_t WGSize = std::min (NWorkItems, PrefWGSize);
0 commit comments