style(cuda): reformat function and variable names and add some comments
Showing 5 changed files with 92 additions and 64 deletions.
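Both .cu files below rely on the CUDA_CHECK and CUBLAS_CHECK macros from cuda_utils.h, which this view leaves collapsed. Their actual definitions are not shown in the diff, so the following is only a sketch of the conventional error-checking pattern such macros tend to follow, not the template's real code:

// Assumed sketch of a CUDA error-checking macro; cuda_utils.h is not
// expanded in this view, so this is illustrative only.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cudaStatus = (call);                                      \
        if (cudaStatus != cudaSuccess) {                                      \
            std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",                \
                         cudaGetErrorString(cudaStatus), __FILE__, __LINE__); \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

A CUBLAS_CHECK macro typically follows the same shape, testing a cublasStatus_t against CUBLAS_STATUS_SUCCESS.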
template/cuda/{{cookiecutter.project_slug}}/src/matrix_add.cu (67 changes: 43 additions & 24 deletions)
@@ -1,39 +1,58 @@
 #include "cuda_utils.h"

-__global__ void matrixAdd(const float* A, const float* B, float* C, int rows, int cols) {
-    int row = blockIdx.y * blockDim.y + threadIdx.y;
-    int col = blockIdx.x * blockDim.x + threadIdx.x;
+// CUDA kernel for adding two matrices element-wise
+__global__ void addMatricesKernel(const float* matrixA, const float* matrixB, float* resultMatrix, int numRows, int numCols) {
+    // Calculate the global row and column indices for this thread
+    int rowIndex = blockIdx.y * blockDim.y + threadIdx.y;
+    int colIndex = blockIdx.x * blockDim.x + threadIdx.x;

-    if (row < rows && col < cols) {
-        int idx = row * cols + col;
-        C[idx] = A[idx] + B[idx];
+    // Check if this thread is within the matrix bounds
+    if (rowIndex < numRows && colIndex < numCols) {
+        // Calculate the linear index for the current element
+        int elementIndex = rowIndex * numCols + colIndex;
+        // Perform element-wise addition
+        resultMatrix[elementIndex] = matrixA[elementIndex] + matrixB[elementIndex];
     }
 }

-void matrixAddHost(const float* A, const float* B, float* C, int rows, int cols) {
-    size_t size = rows * cols * sizeof(float);
+// Host function to set up and execute matrix addition on GPU
+void addMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, float* hostResultMatrix, int numRows, int numCols) {
+    // Calculate total size of the matrices in bytes
+    size_t matrixSizeBytes = numRows * numCols * sizeof(float);

-    float* d_A;
-    float* d_B;
-    float* d_C;
+    // Declare pointers for device (GPU) memory
+    float* deviceMatrixA;
+    float* deviceMatrixB;
+    float* deviceResultMatrix;

-    CUDA_CHECK(cudaMalloc(&d_A, size));
-    CUDA_CHECK(cudaMalloc(&d_B, size));
-    CUDA_CHECK(cudaMalloc(&d_C, size));
+    // Allocate memory on the GPU
+    CUDA_CHECK(cudaMalloc(&deviceMatrixA, matrixSizeBytes));
+    CUDA_CHECK(cudaMalloc(&deviceMatrixB, matrixSizeBytes));
+    CUDA_CHECK(cudaMalloc(&deviceResultMatrix, matrixSizeBytes));

-    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));
+    // Copy input matrices from host to device
+    CUDA_CHECK(cudaMemcpy(deviceMatrixA, hostMatrixA, matrixSizeBytes, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(deviceMatrixB, hostMatrixB, matrixSizeBytes, cudaMemcpyHostToDevice));

+    // Define the grid and block dimensions
     dim3 threadsPerBlock(16, 16);
-    dim3 blocksPerGrid((cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
-                       (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);
+    dim3 numBlocks((numCols + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                   (numRows + threadsPerBlock.y - 1) / threadsPerBlock.y);

-    matrixAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, rows, cols);
+    // Launch the CUDA kernel
+    addMatricesKernel<<<numBlocks, threadsPerBlock>>>(deviceMatrixA, deviceMatrixB, deviceResultMatrix, numRows, numCols);
+
+    // Check for errors
+    CUDA_CHECK(cudaGetLastError());
+
+    // Wait for GPU to finish
+    CUDA_CHECK(cudaDeviceSynchronize());

-    CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));
+    // Copy the result back to host memory
+    CUDA_CHECK(cudaMemcpy(hostResultMatrix, deviceResultMatrix, matrixSizeBytes, cudaMemcpyDeviceToHost));

-    CUDA_CHECK(cudaFree(d_A));
-    CUDA_CHECK(cudaFree(d_B));
-    CUDA_CHECK(cudaFree(d_C));
+    // Free GPU memory
+    CUDA_CHECK(cudaFree(deviceMatrixA));
+    CUDA_CHECK(cudaFree(deviceMatrixB));
+    CUDA_CHECK(cudaFree(deviceResultMatrix));
 }
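For context, here is a minimal sketch of how the renamed host function might be exercised from a small host-side harness. The harness is an assumption for illustration (the commit's remaining changed files are not expanded in this view); only the addMatricesOnGPU signature comes from the diff above.

// Hypothetical test harness for addMatricesOnGPU; not part of this commit.
#include <cstdio>
#include <vector>

void addMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB,
                      float* hostResultMatrix, int numRows, int numCols);

int main() {
    const int numRows = 4;
    const int numCols = 8;
    std::vector<float> a(numRows * numCols, 1.0f);  // matrix of ones
    std::vector<float> b(numRows * numCols, 2.0f);  // matrix of twos
    std::vector<float> c(numRows * numCols, 0.0f);  // result buffer

    addMatricesOnGPU(a.data(), b.data(), c.data(), numRows, numCols);

    // Every element of c should now be 3.0f.
    std::printf("c[0] = %f\n", c[0]);
    return 0;
}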
template/cuda/{{cookiecutter.project_slug}}/src/matrix_mult.cu (79 changes: 44 additions & 35 deletions)
@@ -1,40 +1,49 @@
 #include "cuda_utils.h"

-void matrixMultHost(const float* A, const float* B, float* C, int rowsA, int colsA, int colsB) {
-    size_t sizeA = rowsA * colsA * sizeof(float);
-    size_t sizeB = colsA * colsB * sizeof(float);
-    size_t sizeC = rowsA * colsB * sizeof(float);
+// Function to perform matrix multiplication on GPU using cuBLAS
+void multiplyMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, float* hostResultMatrix,
+                           int numRowsA, int numColsA, int numColsB) {
+    // Calculate sizes in bytes for each matrix
+    size_t byteSizeA = numRowsA * numColsA * sizeof(float);
+    size_t byteSizeB = numColsA * numColsB * sizeof(float);
+    size_t byteSizeC = numRowsA * numColsB * sizeof(float);

-    float* d_A;
-    float* d_B;
-    float* d_C;
+    // Declare pointers for device (GPU) memory
+    float *deviceMatrixA, *deviceMatrixB, *deviceResultMatrix;

-    CUDA_CHECK(cudaMalloc(&d_A, sizeA));
-    CUDA_CHECK(cudaMalloc(&d_B, sizeB));
-    CUDA_CHECK(cudaMalloc(&d_C, sizeC));
+    // Allocate memory on the GPU
+    CUDA_CHECK(cudaMalloc(&deviceMatrixA, byteSizeA));
+    CUDA_CHECK(cudaMalloc(&deviceMatrixB, byteSizeB));
+    CUDA_CHECK(cudaMalloc(&deviceResultMatrix, byteSizeC));

-    CUDA_CHECK(cudaMemcpy(d_A, A, sizeA, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(d_B, B, sizeB, cudaMemcpyHostToDevice));
+    // Copy input matrices from host to device
+    CUDA_CHECK(cudaMemcpy(deviceMatrixA, hostMatrixA, byteSizeA, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(deviceMatrixB, hostMatrixB, byteSizeB, cudaMemcpyHostToDevice));

-    cublasHandle_t handle;
-    CUBLAS_CHECK(cublasCreate(&handle));
+    // Create cuBLAS handle
+    cublasHandle_t cublasHandle;
+    CUBLAS_CHECK(cublasCreate(&cublasHandle));

+    // Set up parameters for cublasSgemm
     const float alpha = 1.0f;
     const float beta = 0.0f;
-    CUBLAS_CHECK(cublasSgemm(handle,
-                             CUBLAS_OP_N, CUBLAS_OP_N,
-                             colsB, rowsA, colsA,
-                             &alpha,
-                             d_B, colsB,
-                             d_A, colsA,
-                             &beta,
-                             d_C, colsB));

-    CUDA_CHECK(cudaMemcpy(C, d_C, sizeC, cudaMemcpyDeviceToHost));
-
-    CUDA_CHECK(cudaFree(d_A));
-    CUDA_CHECK(cudaFree(d_B));
-    CUDA_CHECK(cudaFree(d_C));
-
-    CUBLAS_CHECK(cublasDestroy(handle));
+    // Perform matrix multiplication using cuBLAS
+    CUBLAS_CHECK(cublasSgemm(cublasHandle,
+                             CUBLAS_OP_N, CUBLAS_OP_N,
+                             numColsB, numRowsA, numColsA,
+                             &alpha,
+                             deviceMatrixB, numColsB,
+                             deviceMatrixA, numColsA,
+                             &beta,
+                             deviceResultMatrix, numColsB));

+    // Copy the result back to host memory
+    CUDA_CHECK(cudaMemcpy(hostResultMatrix, deviceResultMatrix, byteSizeC, cudaMemcpyDeviceToHost));
+
+    // Clean up: Free GPU memory and destroy cuBLAS handle
+    CUDA_CHECK(cudaFree(deviceMatrixA));
+    CUDA_CHECK(cudaFree(deviceMatrixB));
+    CUDA_CHECK(cudaFree(deviceResultMatrix));
+    CUBLAS_CHECK(cublasDestroy(cublasHandle));
 }
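One detail the commit's new comments do not spell out: cuBLAS assumes column-major storage, while this template keeps matrices row-major. A row-major matrix reinterpreted as column-major is its transpose, so passing B before A computes B^T * A^T = (A * B)^T in column-major terms, which is exactly A * B laid out row-major. An annotated restatement of the same call (illustration only):

// Same cublasSgemm call as above, with the parameter mapping spelled out.
// cublasSgemm computes C = alpha * op(A) * op(B) + beta * C, column-major.
CUBLAS_CHECK(cublasSgemm(cublasHandle,
                         CUBLAS_OP_N, CUBLAS_OP_N,  // no explicit transposes
                         numColsB,                  // m: rows of B^T (= cols of B)
                         numRowsA,                  // n: cols of A^T (= rows of A)
                         numColsA,                  // k: shared inner dimension
                         &alpha,
                         deviceMatrixB, numColsB,   // B^T, leading dimension numColsB
                         deviceMatrixA, numColsA,   // A^T, leading dimension numColsA
                         &beta,
                         deviceResultMatrix, numColsB)); // (A*B)^T, i.e. row-major A*B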