-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsaxpy.cu
142 lines (111 loc) · 5.36 KB
/
saxpy.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <driver_functions.h>
#include "CycleTimer.h"
// Convert a byte count moved in `sec` seconds into effective bandwidth,
// expressed in GiB/sec (1024^3-byte gigabytes).
float GBPerSec(int bytes, float sec) {
    const double bytesPerGB = 1024. * 1024. * 1024.;
    return static_cast<float>(bytes) / bytesPerGB / sec;
}
// CUDA kernel (note the __global__ qualifier): each thread computes one
// element of result[i] = alpha * x[i] + y[i].
//
// Expects a 1D grid of 1D blocks with at least N threads in total; the
// bounds check below makes launches rounded up past N safe.
__global__ void
saxpy_kernel(int N, float alpha, float* x, float* y, float* result) {
    // Flatten (block, thread) position into a single global element index.
    // Only the .x dimensions matter because the launch is one-dimensional.
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    // Threads in the final, partially-filled block whose index falls past
    // the end of the arrays do nothing. This keeps the kernel correct when
    // N is not a multiple of blockDim.x.
    if (i >= N)
        return;

    result[i] = alpha * x[i] + y[i];
}
// saxpyCuda --
//
// Regular C++ code running on the CPU. Allocates GPU buffers with the CUDA
// API, copies the two input arrays (xarray, yarray) from host memory to GPU
// memory, launches saxpy_kernel on the GPU, and copies the result back into
// resultarray. Prints kernel-only and end-to-end timings with effective
// bandwidth.
//
// Parameters:
//   N           - number of elements in each array
//   alpha       - SAXPY scale factor
//   xarray      - host input array of N floats
//   yarray      - host input array of N floats
//   resultarray - host output array of N floats (written on return)
void saxpyCuda(int N, float alpha, float* xarray, float* yarray, float* resultarray) {
    // Timed traffic: read both inputs and write the output (3 arrays of N floats).
    int totalBytes = sizeof(float) * 3 * N;

    // Hardcoded thread-block size for this application.
    const int threadsPerBlock = 512;

    // Round up so there is at least one thread per array element even when
    // N is not a multiple of threadsPerBlock (the kernel bounds-checks).
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Device pointers: these reference memory that lives *on the GPU*. The
    // CPU cannot dereference them directly; they may only be passed to CUDA
    // API calls and to kernel launches.
    float* device_x = nullptr;
    float* device_y = nullptr;
    float* device_result = nullptr;

    double totalStartTime = CycleTimer::currentSeconds();

    // Allocate device memory buffers on the GPU.
    cudaMalloc(&device_x, N * sizeof(float));
    cudaMalloc(&device_y, N * sizeof(float));
    cudaMalloc(&device_result, N * sizeof(float));

    // Copy the input arrays to the GPU.
    cudaMemcpy(device_x, xarray, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(device_y, yarray, N * sizeof(float), cudaMemcpyHostToDevice);

    // Launch the CUDA kernel (the <<< >>> brackets denote a kernel launch).
    // Launches are asynchronous, so synchronize before reading the clock to
    // time the kernel itself.
    double startTime = CycleTimer::currentSeconds();
    saxpy_kernel<<<blocks, threadsPerBlock>>>(N, alpha, device_x, device_y, device_result);

    // Catch launch-configuration errors immediately; execution errors
    // surface at the synchronize below.
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess) {
        fprintf(stderr, "WARNING: A CUDA error occured: code=%d, %s\n",
                launchErr, cudaGetErrorString(launchErr));
    }

    cudaDeviceSynchronize();
    double endTime = CycleTimer::currentSeconds();

    // Copy the result from the GPU back to the CPU.
    cudaMemcpy(resultarray, device_result, N * sizeof(float), cudaMemcpyDeviceToHost);

    double totalEndTime = CycleTimer::currentSeconds();

    cudaError_t errCode = cudaPeekAtLastError();
    if (errCode != cudaSuccess) {
        fprintf(stderr, "WARNING: A CUDA error occured: code=%d, %s\n",
                errCode, cudaGetErrorString(errCode));
    }

    // BUG FIX: the original assigned these the wrong way around, printing
    // the end-to-end time as "Kernel Duration" and the kernel time as
    // "Execution Time". Kernel time is the launch-to-sync window; overall
    // time also includes the allocations and both memcpys.
    double kernelDuration = endTime - startTime;
    double overallDuration = totalEndTime - totalStartTime;

    printf("Total Kernel Duration: %.3f ms\t\t[%.3f GB/s]\n", 1000.f * kernelDuration, GBPerSec(totalBytes, kernelDuration));
    printf("Total Execution Time: %.3f ms\t\t[%.3f GB/s]\n", 1000.f * overallDuration, GBPerSec(totalBytes, overallDuration));

    // Free the device memory buffers.
    cudaFree(device_x);
    cudaFree(device_y);
    cudaFree(device_result);
}
// Print stats about every CUDA-capable GPU in the machine. Useful if
// students want to know what GPU they are running on.
void printCudaInfo() {
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);

    // BUG FIX: `err` was previously captured but never inspected, silently
    // reporting "Found 0 CUDA devices" when the query itself failed (e.g.
    // no driver installed).
    if (err != cudaSuccess) {
        fprintf(stderr, "WARNING: cudaGetDeviceCount failed: code=%d, %s\n",
                err, cudaGetErrorString(err));
    }

    printf("---------------------------------------------------------\n");
    printf("Found %d CUDA devices\n", deviceCount);

    for (int i=0; i<deviceCount; i++) {
        cudaDeviceProp deviceProps;
        cudaGetDeviceProperties(&deviceProps, i);
        printf("Device %d: %s\n", i, deviceProps.name);
        printf("   SMs:        %d\n", deviceProps.multiProcessorCount);
        printf("   Global mem: %.0f MB\n",
               static_cast<float>(deviceProps.totalGlobalMem) / (1024 * 1024));
        printf("   CUDA Cap:   %d.%d\n", deviceProps.major, deviceProps.minor);
    }
    printf("---------------------------------------------------------\n");
}