matrix_multiply.cu
// This plugin multiplies two square matrices, using a not-particularly-
// optimized matrix multiply kernel. This uses a 2D grid of 2D blocks, mapping
// one thread per resulting matrix element. The block_count setting from the
// config is ignored, and thread_count must be 2-dimensional. The
// additional_info must be a JSON object with the following keys:
//
// - "matrix_width": The width of the square matrix of floating-point numbers
// to multiply.
//
// - "skip_copy": A boolean. Optional, defaults to false. If it's set to true,
// then the "copy_in" and "copy_out" phases will not copy the matrix data to
// or from the GPU. Instead, the input matrices will only be copied once,
// during the initialize function, and output data will never be copied. May
// be useful if you want a simple workload without as many memory transfers.
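//
// An illustrative additional_info value (the numbers here are example
// settings only, not defaults):
//
//   {
//     "matrix_width": 1024,
//     "skip_copy": false
//   }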
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include "third_party/cJSON.h"
#include "benchmark_gpu_utilities.h"
#include "library_interface.h"
// Holds the state of an instance of this plugin.
typedef struct {
cudaStream_t stream;
// Tracking whether the stream is created simplifies cleanup code.
int stream_created;
// The grid size is computed based on the matrix width. We use an array
// rather than a dim3 here in order to allow memset on PluginState. (The Z
// dimension is always 1.)
int grid_size[2];
// The block size is determined by the thread_count setting in the config.
// (See the note on grid_size.)
int block_size[2];
  // Set this to nonzero to skip copying the input and output matrices, apart
  // from during initialization.
int skip_copy;
// The width of each square matrix.
int matrix_width;
// The device-side matrices. The computation will be d_c = d_a x d_b.
float *d_a, *d_b, *d_c;
// The three host-side copies of the matrices.
float *h_a, *h_b, *h_c;
// The recordings of the start and end GPU clock cycle for each block.
uint64_t *device_block_times;
// Holds the device copy of the SMID each block was assigned to.
uint32_t *device_block_smids;
// Holds times that are shared with the plugin host.
KernelTimes kernel_times;
} PluginState;
// Implements the Cleanup() function required by the plugin interface.
static void Cleanup(void *data) {
PluginState *state = (PluginState *) data;
if (!state) return;
cudaFree(state->d_a);
cudaFree(state->d_b);
cudaFree(state->d_c);
cudaFreeHost(state->h_a);
cudaFreeHost(state->h_b);
cudaFreeHost(state->h_c);
cudaFreeHost(state->kernel_times.block_times);
cudaFreeHost(state->kernel_times.block_smids);
if (state->stream_created) {
CheckCUDAError(cudaStreamDestroy(state->stream));
}
  // Free the device-side block time and SM ID buffers.
  if (state->device_block_times) cudaFree(state->device_block_times);
  if (state->device_block_smids) cudaFree(state->device_block_smids);
  // Clear the struct so that use-after-free bugs are easier to spot.
memset(state, 0, sizeof(*state));
free(state);
}
static float RandomFloat(void) {
// Maybe replace this with something faster?
float to_return = ((float) rand()) / ((float) RAND_MAX);
return to_return;
}
// Allocates the host and device matrices, and randomly initializes them.
// Returns 0 on error. Must be called after grid_size has been initialized.
static int AllocateMemory(PluginState *state) {
int i, j, width, block_count;
size_t size, smids_size;
width = state->matrix_width;
block_count = state->grid_size[0] * state->grid_size[1];
// Allocate host and device memory for block times.
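  // (Each block records two uint64_t values: its start and end timestamps.)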
size = block_count * 2 * sizeof(uint64_t);
if (!CheckCUDAError(cudaMallocHost(&(state->kernel_times.block_times),
size))) {
return 0;
}
if (!CheckCUDAError(cudaMalloc(&(state->device_block_times), size))) {
return 0;
}
smids_size = block_count * sizeof(uint32_t);
if (!CheckCUDAError(cudaMallocHost(&(state->kernel_times.block_smids),
smids_size))) {
return 0;
}
if (!CheckCUDAError(cudaMalloc(&(state->device_block_smids), smids_size))) {
return 0;
}
// Allocate the matrices.
size = width * width * sizeof(float);
if (!CheckCUDAError(cudaMalloc(&state->d_a, size))) return 0;
if (!CheckCUDAError(cudaMalloc(&state->d_b, size))) return 0;
if (!CheckCUDAError(cudaMalloc(&state->d_c, size))) return 0;
if (!CheckCUDAError(cudaMallocHost(&state->h_a, size))) return 0;
if (!CheckCUDAError(cudaMallocHost(&state->h_b, size))) return 0;
if (!CheckCUDAError(cudaMallocHost(&state->h_c, size))) return 0;
memset(state->h_c, 0, size);
// Randomly initialize the host's input matrices.
for (i = 0; i < width; i++) {
for (j = 0; j < width; j++) {
state->h_a[i * width + j] = RandomFloat();
state->h_b[i * width + j] = RandomFloat();
}
}
  // Copy the input matrices to the device.
if (!CheckCUDAError(cudaMemcpyAsync(state->d_a, state->h_a, size,
cudaMemcpyHostToDevice, state->stream))) {
return 0;
}
if (!CheckCUDAError(cudaMemcpyAsync(state->d_b, state->h_b, size,
cudaMemcpyHostToDevice, state->stream))) {
return 0;
}
if (!CheckCUDAError(cudaStreamSynchronize(state->stream))) return 0;
return 1;
}
// Parses the additional_info argument. Returns 0 on error.
static int ParseAdditionalInfo(const char *arg, PluginState *state) {
cJSON *root = NULL;
cJSON *entry = NULL;
root = cJSON_Parse(arg);
if (!root) {
printf("Invalid additional_info for matrix_multiply.\n");
return 0;
}
// Make sure that matrix_width is present and positive.
entry = cJSON_GetObjectItem(root, "matrix_width");
if (!entry || (entry->type != cJSON_Number)) {
printf("Invalid matrix_width setting.\n");
cJSON_Delete(root);
return 0;
}
if (entry->valuedouble < 1.0) {
printf("The matrix_width setting must be at least 1.\n");
cJSON_Delete(root);
return 0;
}
if (entry->valuedouble > 1e6) {
printf("Warning: huge matrix width: %f\n", entry->valuedouble);
}
state->matrix_width = (int) entry->valuedouble;
// If skip_copy is present, make sure it's a boolean. state->skip_copy is
// initialized to 0 already.
entry = cJSON_GetObjectItem(root, "skip_copy");
if (entry) {
if ((entry->type != cJSON_True) && (entry->type != cJSON_False)) {
printf("The skip_copy setting must be a boolean.\n");
cJSON_Delete(root);
return 0;
}
state->skip_copy = entry->type == cJSON_True;
}
cJSON_Delete(root);
root = NULL;
entry = NULL;
return 1;
}
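// Implements the Initialize() function required by the plugin interface.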
static void* Initialize(InitializationParameters *params) {
PluginState *state = NULL;
int blocks_wide, blocks_tall, matrix_width;
if (!CheckCUDAError(cudaSetDevice(params->cuda_device))) return NULL;
state = (PluginState *) calloc(sizeof(*state), 1);
if (!state) {
printf("Failed allocating plugin state.\n");
return NULL;
}
// We need to know the matrix width before we can allocate memory or
// determine grid size.
if (!ParseAdditionalInfo(params->additional_info, state)) {
Cleanup(state);
return NULL;
}
matrix_width = state->matrix_width;
  state->block_size[0] = params->block_dim[0];
  state->block_size[1] = params->block_dim[1];
  if (params->block_dim[1] == 1) {
    // Print a warning here; the old behavior was different, so this will help
    // catch and update configs expecting the old version. (e.g. we probably
    // want to use 32x32 blocks rather than 1024x1 blocks!)
    printf("Warning! Specified a 1-D block dim for matrix multiply.\n");
    printf("HACK: Defaulting to 32x32.\n");
    state->block_size[0] = 32;
    state->block_size[1] = 32;
  }
// Compute the grid size from the block size and matrix width.
blocks_wide = matrix_width / state->block_size[0];
if ((matrix_width % state->block_size[0]) != 0) blocks_wide++;
blocks_tall = matrix_width / state->block_size[1];
if ((matrix_width % state->block_size[1]) != 0) blocks_tall++;
state->grid_size[0] = blocks_wide;
state->grid_size[1] = blocks_tall;
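  // For example, a 1000x1000 matrix with 32x32 blocks yields a 32x32 grid,
  // since ceil(1000 / 32) = 32.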
// Create the stream and fill in boilerplate for reporting to the framework.
if (!CheckCUDAError(CreateCUDAStreamWithPriorityAndMask(
params->stream_priority, params->sm_mask, &(state->stream)))) {
Cleanup(state);
return NULL;
}
state->stream_created = 1;
state->kernel_times.kernel_name = "matrix_multiply";
state->kernel_times.thread_count = state->block_size[0] * state->block_size[1];
state->kernel_times.block_count = state->grid_size[0] * state->grid_size[1];
state->kernel_times.shared_memory = 0;
  // Allocate the matrices and initialize the inputs.
if (!AllocateMemory(state)) {
Cleanup(state);
return NULL;
}
return state;
}
// Reset block times and copy input matrices.
static int CopyIn(void *data) {
PluginState *state = (PluginState *) data;
int block_count = state->grid_size[0] * state->grid_size[1];
// Reset block times
size_t size = block_count * 2 * sizeof(uint64_t);
if (!CheckCUDAError(cudaMemsetAsync(state->device_block_times, 0xff,
size, state->stream))) {
return 0;
}
// If we're skipping the copies we can return now.
if (state->skip_copy) {
if (!CheckCUDAError(cudaStreamSynchronize(state->stream))) return 0;
return 1;
}
// Copy input matrices.
size = state->matrix_width * state->matrix_width * sizeof(float);
if (!CheckCUDAError(cudaMemcpyAsync(state->d_a, state->h_a, size,
cudaMemcpyHostToDevice, state->stream))) {
return 0;
}
if (!CheckCUDAError(cudaMemcpyAsync(state->d_b, state->h_b, size,
cudaMemcpyHostToDevice, state->stream))) {
return 0;
}
if (!CheckCUDAError(cudaStreamSynchronize(state->stream))) return 0;
return 1;
}
// The GPU kernel for carrying out matrix multiplication. Expects a 2D grid
// with sufficient threads to cover the entire matrix.
__global__ void MatrixMultiplyKernel(float *a, float *b, float *c, int width,
uint64_t *block_times, uint32_t *block_smids) {
int row, col, k, block_index;
float v_a, v_b, v_c;
uint64_t start_clock = clock64();
block_index = blockIdx.y * gridDim.x + blockIdx.x;
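  // device_block_times is reset to all 0xff bytes before each launch (see
  // CopyIn), so this comparison lets an early thread in each block record the
  // block's start clock and SM ID.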
if (start_clock < block_times[block_index * 2]) {
block_times[block_index * 2] = start_clock;
block_smids[block_index] = GetSMID();
}
  // The row and column of the element in the output matrix are determined by
  // the thread's position in the 2D grid.
col = blockIdx.x * blockDim.x + threadIdx.x;
row = blockIdx.y * blockDim.y + threadIdx.y;
if ((col >= width) || (row >= width)) {
// Don't try doing computations if we're outside of the matrix.
block_times[block_index * 2 + 1] = clock64();
return;
}
// Actually carry out the multiplication for this thread's element.
v_c = 0;
for (k = 0; k < width; k++) {
v_a = a[row * width + k];
v_b = b[k * width + col];
v_c += v_a * v_b;
}
c[row * width + col] = v_c;
block_times[block_index * 2 + 1] = clock64();
}
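// Implements the Execute() function: records launch timestamps and runs the
// matrix multiply kernel on the plugin's stream.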
static int Execute(void *data) {
PluginState *state = (PluginState *) data;
state->kernel_times.cuda_launch_times[0] = CurrentSeconds();
dim3 block_size, grid_size;
grid_size.x = state->grid_size[0];
grid_size.y = state->grid_size[1];
grid_size.z = 1;
block_size.x = state->block_size[0];
block_size.y = state->block_size[1];
block_size.z = 1;
MatrixMultiplyKernel<<<grid_size, block_size, 0, state->stream>>>(
state->d_a, state->d_b, state->d_c, state->matrix_width,
state->device_block_times, state->device_block_smids);
state->kernel_times.cuda_launch_times[1] = CurrentSeconds();
if (!CheckCUDAError(cudaStreamSynchronize(state->stream))) return 0;
state->kernel_times.cuda_launch_times[2] = CurrentSeconds();
return 1;
}
// Copy the block times to the host, along with the result matrix.
static int CopyOut(void *data, TimingInformation *times) {
PluginState *state = (PluginState *) data;
int block_count = state->grid_size[0] * state->grid_size[1];
size_t size;
// First copy the block times to the host.
size = block_count * 2 * sizeof(uint64_t);
if (!CheckCUDAError(cudaMemcpyAsync(state->kernel_times.block_times,
state->device_block_times, size, cudaMemcpyDeviceToHost, state->stream))) {
return 0;
}
if (!CheckCUDAError(cudaMemcpyAsync(state->kernel_times.block_smids,
state->device_block_smids, block_count * sizeof(uint32_t),
cudaMemcpyDeviceToHost, state->stream))) {
return 0;
}
// Provide the framework with a pointer to our kernel_times struct.
times->kernel_count = 1;
times->kernel_info = &(state->kernel_times);
// We can return now if we're not copying the result matrix.
if (state->skip_copy) {
times->resulting_data_size = 0;
times->resulting_data = NULL;
if (!CheckCUDAError(cudaStreamSynchronize(state->stream))) return 0;
return 1;
}
// Copy the result matrix.
size = state->matrix_width * state->matrix_width * sizeof(float);
if (!CheckCUDAError(cudaMemcpyAsync(state->h_c, state->d_c, size,
cudaMemcpyDeviceToHost, state->stream))) {
return 0;
}
times->resulting_data_size = size;
times->resulting_data = state->h_c;
if (!CheckCUDAError(cudaStreamSynchronize(state->stream))) return 0;
return 1;
}
static const char* GetName(void) {
return "Matrix Multiply";
}
int RegisterFunctions(BenchmarkLibraryFunctions *functions) {
functions->get_name = GetName;
functions->cleanup = Cleanup;
functions->initialize = Initialize;
functions->copy_in = CopyIn;
functions->execute = Execute;
functions->copy_out = CopyOut;
return 1;
}