Skip to content

Commit faffe6e

Browse files
committed
Fixes #175, #458: Added graph support, including stream capture and a graph node builder class.
Also added two modified CUDA sample programs using graph support: * graphMemoryNodes * jacobiCudaGraphs
1 parent 6b4a9dd commit faffe6e

File tree

25 files changed

+4459
-30
lines changed

25 files changed

+4459
-30
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ write_basic_package_version_file(
163163
COMPATIBILITY ${COMPAT_SETTING}
164164
)
165165

166+
166167
install(
167168
FILES "${CMAKE_CURRENT_BINARY_DIR}/cuda-api-wrappers-config-version.cmake"
168169
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cuda-api-wrappers"

README.md

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -115,13 +115,7 @@ Since this is a header-only library, you can simply add the `src/` subdirectory
115115

116116
## Coverage of the APIs
117117

118-
Most, but not all, API calls in the Runtime, Driver, NVTX and NVRTC are covered by these wrappers. Specifically, the following are missing:
119-
120-
* Execution graph management
121-
* CUDA 12.x "texture objects", "surface objects" and "tensor objects" (textures and texture references, introduced in earlier CUDA versions, are supported)
122-
* Interoperability with OpenGL, Direct3D, EGL, VDAPU.
123-
124-
Support for textures, arrays and surfaces exists, but is partial: Not all relevant API functions are covered.
118+
Most, but not quite all, API calls in the Runtime, Driver, NVTX and NVRTC are covered by these wrappers. You can find the main omissions as [issues tagged with "missing-cuda-feature"](https://github.com/eyalroz/cuda-api-wrappers/issues?q=is%3Aissue+is%3Aopen+label%3Amissing-cuda-feature), intended for further development work. Additionally, the wrapper library does not cover the APIs for interoperability with OpenGL, Direct3D, EGL and VDPAU.
125119

126120
The [Milestones](https://github.com/eyalroz/cuda-api-wrappers/milestones) page indicates some features which aren't covered and are slated for future work. Since I am not currently working on anything graphics-related, there are no short-term plans to extend coverage to more graphics-related APIs; however - PRs are welcome.
127121

examples/CMakeLists.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,20 @@ if(USE_COOPERATIVE_GROUPS AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.0")
144144
add_executable(binaryPartitionCG modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu)
145145
endif()
146146
add_executable(bandwidthtest modified_cuda_samples/bandwidthtest/bandwidthtest.cpp)
147+
# CUDA execution-graph support (used by these examples) requires CUDA 10.0+
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "10.0")
	# Note: Between versions 11.7 and 12.1.1, CUDA's own cooperative groups header
	# tries to include the CUDA faux standard library - which it shouldn't;
	# hence these examples are skipped for MSVC with those toolkit versions.
	if((NOT MSVC)
		OR (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.2")
		OR (CUDAToolkit_VERSION VERSION_LESS "11.7")
	)
		add_executable(simpleCudaGraphs modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu)
		add_executable(jacobiCudaGraphs
			modified_cuda_samples/jacobiCudaGraphs/main.cpp
			modified_cuda_samples/jacobiCudaGraphs/jacobi.cu
		)
	endif()
endif()
147161
#----
148162

149163
add_executable(version_management by_api_module/version_management.cpp)

examples/common.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,18 @@ bool your_type_was_() { return true; }
3535
#define print_type_of(_x) your_type_was_<decltype(_x)>()
3636
#endif
3737

38+
// Returns the English ordinal suffix ("st", "nd", "rd" or "th") for an integer,
// honoring the 11th/12th/13th exception. Negative values use the suffix of
// their magnitude (e.g. -2 -> "nd").
inline const char* ordinal_suffix(int n)
{
	static const char suffixes [4][5] = {"th", "st", "nd", "rd"};
	auto ord = n % 100;
	// fix: for negative n, ord was negative and indexed suffixes[] out of
	// bounds (undefined behavior); use the magnitude instead
	if (ord < 0) { ord = -ord; }
	if (ord / 10 == 1) { ord = 0; } // 11, 12, 13 all take "th"
	ord = ord % 10;
	return suffixes[ord > 3 ? 0 : ord];
}

// Renders n with its ordinal suffix, e.g. 1 -> "1st", 22 -> "22nd", 30 -> "30th".
template <typename N = int>
inline ::std::string xth(N n) { return ::std::to_string(n) + ordinal_suffix(n); }
49+
3850
const char* cache_preference_name(cuda::multiprocessor_cache_preference_t pref)
3951
{
4052
static const char* cache_preference_names[] = {
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
#include "jacobi_kernels.cuh"
2+
#include "jacobi.h"
3+
4+
#include <vector>
5+
#include <iomanip>
6+
#include <iostream>
7+
#include <cuda/api.hpp>
8+
9+
static void finalize_error(
10+
const cuda::stream_t& stream, span<double> d_sum, const cuda::launch_configuration_t& launch_config,
11+
double& sum, int k, const span<double> x_to_overwrite)
12+
{
13+
stream.enqueue.memzero(d_sum);
14+
auto final_error_launch_config = launch_config;
15+
final_error_launch_config.dimensions.grid.x = (N_ROWS / final_error_launch_config.dimensions.block.x) + 1;
16+
auto warps_per_block = final_error_launch_config.dimensions.block.x / cuda::warp_size;
17+
final_error_launch_config.dynamic_shared_memory_size = (warps_per_block + 1) * sizeof(double);
18+
// TODO: Double-check the original source to ensure we're using the right x here
19+
stream.enqueue.kernel_launch(finalError, final_error_launch_config, x_to_overwrite.data(), d_sum.data());
20+
stream.enqueue.copy(&sum, d_sum);
21+
stream.synchronize();
22+
report_error_sum("GPU", k + 1, sum);
23+
}
24+
25+
// Jacobi solver variant which builds an execution graph explicitly
// (memset -> kernel -> memcpy), instantiates it once, and between launches
// only replaces the kernel node's parameters to ping-pong between the
// x / x_new buffers - avoiding re-instantiation.
template<>
double do_jacobi_inner<computation_method_t::graph_with_set_kernel_params>(
	const cuda::device_t &device,
	const cuda::stream_t &stream,
	span<float const> A,
	span<double const> b,
	float convergence_threshold,
	int num_iterations,
	span<double> x,
	span<double> x_new,
	span<double> d_sum)
{
	auto launch_config = cuda::launch_config_builder()
		.block_size(256)
		.grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1)
		.build();

	double sum; // host-side error sum; written by the graph's memcpy node

	auto graph = cuda::graph::create();

	using cuda::graph::node::kind_t;

	// Node 1: zero the device-side error-sum accumulator
	auto memset_node = [&] {
		cuda::graph::node::parameters_t<kind_t::memory_set> params;
		params.value = 0;
		// NOTE(review): d_sum holds doubles (8 bytes); a 4-byte element width
		// still zeroes the full region, but confirm this width is intended.
		params.width_in_bytes = 4;
		params.region = d_sum;
		return graph.insert.node<kind_t::memory_set>(params);
	}();

	// Node 2: one Jacobi iteration. Two parameter sets are prepared, differing
	// only in which of x / x_new is read and which is written.
	auto jacobi_kernel = cuda::kernel::get(device, JacobiMethod);
	struct { cuda::graph::node::parameters_t<kind_t::kernel_launch> odd, even; } kernel_params = {
		{ jacobi_kernel, launch_config, cuda::graph::make_kernel_argument_pointers(A, b, convergence_threshold, x, x_new, d_sum) },
		{ jacobi_kernel, launch_config, cuda::graph::make_kernel_argument_pointers(A, b, convergence_threshold, x_new, x, d_sum) },
	};
	// NOTE(review): the node is created with the `even` parameter set, and the
	// loop below selects `even` again after k == 0 - so `odd` is only applied
	// after odd values of k. Verify against the original CUDA sample that the
	// first launch is indeed meant to read x_new rather than x.
	auto jacobi_kernel_node = graph.insert.node<kind_t::kernel_launch>(kernel_params.even);

	graph.insert.edge(memset_node, jacobi_kernel_node);

	// Node 3: copy the per-iteration error sum back into the host variable `sum`
	auto memcpy_node = [&] {
		cuda::memory::copy_parameters_t<3> params;
		params.set_source(d_sum);
		params.set_destination(&sum, 1);
		params.set_extent<double>(1);
		params.clear_offsets();
		params.clear_rest();
		return graph.insert.node<cuda::graph::node::kind_t::memcpy>(params);
	}();

	graph.insert.edge(jacobi_kernel_node, memcpy_node);

	cuda::graph::instance_t instance = graph.instantiate();

	for (int k = 0; k < num_iterations; k++) {
		instance.launch(stream);
		stream.synchronize(); // `sum` is only valid once the graph has completed

		if (sum <= convergence_threshold) {
			// Hand the most recently written buffer to the final-error
			// computation (which overwrites it)
			auto x_to_overwrite = ((k & 1) == 0) ? x : x_new;
			finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite);
			break;
		}
		// Swap read/write buffers for the next launch by replacing the kernel
		// node's parameters within the existing executable instance
		const auto& next_iteration_params = ((k & 1) == 0) ? kernel_params.even : kernel_params.odd;
		instance.set_node_parameters<kind_t::kernel_launch>(jacobi_kernel_node, next_iteration_params);
	}
	return sum;
}
99+
100+
template<>
101+
double do_jacobi_inner<computation_method_t::graph_with_exec_update>(
102+
const cuda::device_t &,
103+
const cuda::stream_t &stream,
104+
span<float const> A,
105+
span<double const> b,
106+
float convergence_threshold,
107+
int num_iterations,
108+
span<double> x,
109+
span<double> x_new,
110+
span<double> d_sum)
111+
{
112+
auto launch_config = cuda::launch_config_builder()
113+
.block_size(256)
114+
.grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1)
115+
.build();
116+
117+
::std::unique_ptr<cuda::graph::instance_t> instance_ptr{};
118+
119+
double sum = 0.0;
120+
for (int k = 0; k < num_iterations; k++) {
121+
stream.begin_capture(cuda::stream::capture::mode_t::global);
122+
stream.enqueue.memzero(d_sum);
123+
auto x_to_read = ((k & 1) == 0) ? x : x_new;
124+
auto x_to_overwrite = ((k & 1) == 0) ? x_new : x;
125+
stream.enqueue.kernel_launch(JacobiMethod, launch_config,
126+
A.data(), b.data(), convergence_threshold, x_to_read.data(), x_to_overwrite.data(), d_sum.data());
127+
stream.enqueue.copy(&sum, d_sum);
128+
auto graph = stream.end_capture();
129+
130+
if (instance_ptr == nullptr) {
131+
auto instance = graph.instantiate();
132+
instance_ptr.reset(new cuda::graph::instance_t{::std::move(instance)});
133+
}
134+
else {
135+
instance_ptr->update(graph);
136+
// Note: The original code tried to re-instantiate if the update
137+
// of the instance failed, we don't do this.
138+
}
139+
stream.enqueue.graph_launch(*instance_ptr);
140+
stream.synchronize();
141+
142+
if (sum <= convergence_threshold) {
143+
finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite);
144+
break;
145+
}
146+
}
147+
148+
return sum;
149+
}
150+
151+
template<>
152+
double do_jacobi_inner<computation_method_t::non_graph_gpu>(
153+
const cuda::device_t &,
154+
const cuda::stream_t &stream,
155+
span<float const> A,
156+
span<double const> b,
157+
float convergence_threshold,
158+
int num_iterations,
159+
span<double> x,
160+
span<double> x_new,
161+
span<double> d_sum)
162+
{
163+
auto launch_config = cuda::launch_config_builder()
164+
.block_size(256)
165+
.grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1)
166+
.build();
167+
168+
double sum;
169+
for (int k = 0; k < num_iterations; k++) {
170+
stream.enqueue.memzero(d_sum);
171+
auto x_to_read = ((k & 1) == 0) ? x : x_new;
172+
auto x_to_overwrite = ((k & 1) == 0) ? x_new : x;
173+
stream.enqueue.kernel_launch(JacobiMethod, launch_config,
174+
A.data(), b.data(), convergence_threshold, x_to_read.data(), x_to_overwrite.data(), d_sum.data());
175+
stream.enqueue.copy(&sum, d_sum);
176+
stream.synchronize();
177+
178+
if (sum <= convergence_threshold) {
179+
finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite);
180+
break;
181+
}
182+
}
183+
184+
return sum;
185+
}
186+
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2+
* Modifications Copyright (c) 2023, Eyal Rozenberg <[email protected]>
3+
*
4+
* Redistribution and use in source and binary forms, with or without
5+
* modification, are permitted provided that the following conditions
6+
* are met:
7+
* * Redistributions of source code must retain the above copyright
8+
* notice, this list of conditions and the following disclaimer.
9+
* * Redistributions in binary form must reproduce the above copyright
10+
* notice, this list of conditions and the following disclaimer in the
11+
* documentation and/or other materials provided with the distribution.
12+
* * Neither the name of NVIDIA CORPORATION nor the names of its
13+
* contributors may be used to endorse or promote products derived
14+
* from this software without specific prior written permission.
15+
*
16+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17+
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19+
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20+
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21+
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22+
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23+
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24+
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27+
*/
28+
29+
#ifndef JACOBI_H
#define JACOBI_H

// Number of rows (and columns) of the Jacobi solver's linear system.
// (fix: this macro was previously defined twice in this header)
#define N_ROWS 512

#include <cuda/api.hpp>

// NOTE(review): MSVC only reports an accurate __cplusplus with /Zc:__cplusplus;
// otherwise the pre-C++20 branch is taken there - confirm that is acceptable.
#if __cplusplus >= 202002L
#include <span>
// fix: `using span = ::std::span;` does not compile - std::span is a class
// template, so it must be exposed through an alias template
template <typename T>
using span = ::std::span<T>;
#else
// Pre-C++20: fall back to the wrapper library's own span type
using ::cuda::span;
#endif
43+
44+
// Identifies which solver implementation strategy to run/benchmark.
enum computation_method_t {
	graph_with_set_kernel_params = 0,
	graph_with_exec_update = 1,
	non_graph_gpu = 2,
	cpu = 3
};

// Human-readable name of a computation method; `method` must be a valid enumerator.
inline const char* method_name(computation_method_t method)
{
	switch (method) {
		case graph_with_set_kernel_params: return "graph_with_set_kernel_params";
		case graph_with_exec_update:       return "graph_with_exec_update";
		case non_graph_gpu:                return "non_graph_gpu";
		case cpu:                          return "cpu";
	}
	return ""; // not reached for valid enumerator values
}
61+
62+
// Prints the error ("residual") sum reached after `num_iterations` iterations;
// `where` identifies the computing side (e.g. "GPU"). Implemented elsewhere.
void report_error_sum(const char* where, int num_iterations, double sum_on_cpu);

// Runs up to `num_iterations` Jacobi iterations for the linear system defined
// by `A` and `b`, using the strategy selected by the `Method` template
// parameter; ping-pongs between the `x` and `x_new` buffers and uses `d_sum`
// as scratch for the per-iteration error sum. Returns the final error sum.
// NOTE(review): A, b, x, x_new and d_sum are presumably device-accessible
// memory regions - confirm against the callers.
template <computation_method_t Method>
double do_jacobi_inner(
	const cuda:: device_t& device,
	const cuda::stream_t &stream,
	span<float const> A,
	span<double const> b,
	float conv_threshold,
	int num_iterations,
	span<double> x,
	span<double> x_new,
	span<double> d_sum);

#endif // JACOBI_H

0 commit comments

Comments
 (0)