style(cuda): reformat the function/variable name and add some comments
pplmx committed Sep 10, 2024
1 parent 645975f commit 11949fe
Showing 5 changed files with 92 additions and 64 deletions.
@@ -2,5 +2,5 @@
 
 #include <cuda_runtime.h>
 
-__global__ void matrixAdd(const float* A, const float* B, float* C, int rows, int cols);
-void matrixAddHost(const float* A, const float* B, float* C, int rows, int cols);
+__global__ void addMatricesKernel(const float* A, const float* B, float* C, int rows, int cols);
+void addMatricesOnGPU(const float* A, const float* B, float* C, int rows, int cols);
@@ -3,4 +3,4 @@
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 
-void matrixMultHost(const float* A, const float* B, float* C, int rowsA, int colsA, int colsB);
+void multiplyMatricesOnGPU(const float* A, const float* B, float* C, int rowsA, int colsA, int colsB);
67 changes: 43 additions & 24 deletions template/cuda/{{cookiecutter.project_slug}}/src/matrix_add.cu
@@ -1,39 +1,58 @@
 #include "cuda_utils.h"
 
-__global__ void matrixAdd(const float* A, const float* B, float* C, int rows, int cols) {
-    int row = blockIdx.y * blockDim.y + threadIdx.y;
-    int col = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (row < rows && col < cols) {
-        int idx = row * cols + col;
-        C[idx] = A[idx] + B[idx];
+// CUDA kernel for adding two matrices element-wise
+__global__ void addMatricesKernel(const float* matrixA, const float* matrixB, float* resultMatrix, int numRows, int numCols) {
+    // Calculate the global row and column indices for this thread
+    int rowIndex = blockIdx.y * blockDim.y + threadIdx.y;
+    int colIndex = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Check if this thread is within the matrix bounds
+    if (rowIndex < numRows && colIndex < numCols) {
+        // Calculate the linear index for the current element
+        int elementIndex = rowIndex * numCols + colIndex;
+        // Perform element-wise addition
+        resultMatrix[elementIndex] = matrixA[elementIndex] + matrixB[elementIndex];
     }
 }
 
-void matrixAddHost(const float* A, const float* B, float* C, int rows, int cols) {
-    size_t size = rows * cols * sizeof(float);
+// Host function to set up and execute matrix addition on GPU
+void addMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, float* hostResultMatrix, int numRows, int numCols) {
+    // Calculate total size of the matrices in bytes
+    size_t matrixSizeBytes = numRows * numCols * sizeof(float);
 
-    float* d_A;
-    float* d_B;
-    float* d_C;
+    // Declare pointers for device (GPU) memory
+    float* deviceMatrixA;
+    float* deviceMatrixB;
+    float* deviceResultMatrix;
 
-    CUDA_CHECK(cudaMalloc(&d_A, size));
-    CUDA_CHECK(cudaMalloc(&d_B, size));
-    CUDA_CHECK(cudaMalloc(&d_C, size));
+    // Allocate memory on the GPU
+    CUDA_CHECK(cudaMalloc(&deviceMatrixA, matrixSizeBytes));
+    CUDA_CHECK(cudaMalloc(&deviceMatrixB, matrixSizeBytes));
+    CUDA_CHECK(cudaMalloc(&deviceResultMatrix, matrixSizeBytes));
 
-    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));
+    // Copy input matrices from host to device
+    CUDA_CHECK(cudaMemcpy(deviceMatrixA, hostMatrixA, matrixSizeBytes, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(deviceMatrixB, hostMatrixB, matrixSizeBytes, cudaMemcpyHostToDevice));
 
+    // Define the grid and block dimensions
     dim3 threadsPerBlock(16, 16);
-    dim3 blocksPerGrid((cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
-                       (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);
+    dim3 numBlocks((numCols + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                   (numRows + threadsPerBlock.y - 1) / threadsPerBlock.y);
 
-    matrixAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, rows, cols);
+    // Launch the CUDA kernel
+    addMatricesKernel<<<numBlocks, threadsPerBlock>>>(deviceMatrixA, deviceMatrixB, deviceResultMatrix, numRows, numCols);
+
+    // Check for errors
+    CUDA_CHECK(cudaGetLastError());
+
+    // Wait for GPU to finish
     CUDA_CHECK(cudaDeviceSynchronize());
 
-    CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));
+    // Copy the result back to host memory
+    CUDA_CHECK(cudaMemcpy(hostResultMatrix, deviceResultMatrix, matrixSizeBytes, cudaMemcpyDeviceToHost));
 
-    CUDA_CHECK(cudaFree(d_A));
-    CUDA_CHECK(cudaFree(d_B));
-    CUDA_CHECK(cudaFree(d_C));
+    // Free GPU memory
+    CUDA_CHECK(cudaFree(deviceMatrixA));
+    CUDA_CHECK(cudaFree(deviceMatrixB));
+    CUDA_CHECK(cudaFree(deviceResultMatrix));
 }
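
Note: the host code relies on the CUDA_CHECK macro from cuda_utils.h, which this commit does not touch and whose definition is not shown in the diff. As a rough sketch, such a macro conventionally wraps each runtime call, prints the error string, and aborts on failure (the template's actual macro may differ):

    // Hypothetical sketch of CUDA_CHECK -- not part of this commit.
    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    #define CUDA_CHECK(call)                                           \
        do {                                                           \
            cudaError_t err_ = (call);                                 \
            if (err_ != cudaSuccess) {                                 \
                fprintf(stderr, "CUDA error: %s at %s:%d\n",           \
                        cudaGetErrorString(err_), __FILE__, __LINE__); \
                exit(EXIT_FAILURE);                                    \
            }                                                          \
        } while (0)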
79 changes: 44 additions & 35 deletions template/cuda/{{cookiecutter.project_slug}}/src/matrix_mult.cu
@@ -1,40 +1,49 @@
 #include "cuda_utils.h"
 
-void matrixMultHost(const float* A, const float* B, float* C, int rowsA, int colsA, int colsB) {
-    size_t sizeA = rowsA * colsA * sizeof(float);
-    size_t sizeB = colsA * colsB * sizeof(float);
-    size_t sizeC = rowsA * colsB * sizeof(float);
-
-    float* d_A;
-    float* d_B;
-    float* d_C;
-
-    CUDA_CHECK(cudaMalloc(&d_A, sizeA));
-    CUDA_CHECK(cudaMalloc(&d_B, sizeB));
-    CUDA_CHECK(cudaMalloc(&d_C, sizeC));
-
-    CUDA_CHECK(cudaMemcpy(d_A, A, sizeA, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(d_B, B, sizeB, cudaMemcpyHostToDevice));
-
-    cublasHandle_t handle;
-    CUBLAS_CHECK(cublasCreate(&handle));
-
+// Function to perform matrix multiplication on GPU using cuBLAS
+void multiplyMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, float* hostResultMatrix,
+                           int numRowsA, int numColsA, int numColsB) {
+    // Calculate sizes in bytes for each matrix
+    size_t byteSizeA = numRowsA * numColsA * sizeof(float);
+    size_t byteSizeB = numColsA * numColsB * sizeof(float);
+    size_t byteSizeC = numRowsA * numColsB * sizeof(float);
+
+    // Declare pointers for device (GPU) memory
+    float *deviceMatrixA, *deviceMatrixB, *deviceResultMatrix;
+
+    // Allocate memory on the GPU
+    CUDA_CHECK(cudaMalloc(&deviceMatrixA, byteSizeA));
+    CUDA_CHECK(cudaMalloc(&deviceMatrixB, byteSizeB));
+    CUDA_CHECK(cudaMalloc(&deviceResultMatrix, byteSizeC));
+
+    // Copy input matrices from host to device
+    CUDA_CHECK(cudaMemcpy(deviceMatrixA, hostMatrixA, byteSizeA, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(deviceMatrixB, hostMatrixB, byteSizeB, cudaMemcpyHostToDevice));
+
+    // Create cuBLAS handle
+    cublasHandle_t cublasHandle;
+    CUBLAS_CHECK(cublasCreate(&cublasHandle));
+
+    // Set up parameters for cublasSgemm
     const float alpha = 1.0f;
     const float beta = 0.0f;
-    CUBLAS_CHECK(cublasSgemm(handle,
-                             CUBLAS_OP_N, CUBLAS_OP_N,
-                             colsB, rowsA, colsA,
-                             &alpha,
-                             d_B, colsB,
-                             d_A, colsA,
-                             &beta,
-                             d_C, colsB));
-
-    CUDA_CHECK(cudaMemcpy(C, d_C, sizeC, cudaMemcpyDeviceToHost));
-
-    CUDA_CHECK(cudaFree(d_A));
-    CUDA_CHECK(cudaFree(d_B));
-    CUDA_CHECK(cudaFree(d_C));
-
-    CUBLAS_CHECK(cublasDestroy(handle));
+
+    // Perform matrix multiplication using cuBLAS
+    CUBLAS_CHECK(cublasSgemm(cublasHandle,
+                             CUBLAS_OP_N, CUBLAS_OP_N,
+                             numColsB, numRowsA, numColsA,
+                             &alpha,
+                             deviceMatrixB, numColsB,
+                             deviceMatrixA, numColsA,
+                             &beta,
+                             deviceResultMatrix, numColsB));
+
+    // Copy the result back to host memory
+    CUDA_CHECK(cudaMemcpy(hostResultMatrix, deviceResultMatrix, byteSizeC, cudaMemcpyDeviceToHost));
+
+    // Clean up: Free GPU memory and destroy cuBLAS handle
+    CUDA_CHECK(cudaFree(deviceMatrixA));
+    CUDA_CHECK(cudaFree(deviceMatrixB));
+    CUDA_CHECK(cudaFree(deviceResultMatrix));
+    CUBLAS_CHECK(cublasDestroy(cublasHandle));
 }
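
Note on the cublasSgemm call (this commit renames its arguments but leaves the math untouched): cuBLAS assumes column-major storage, while this template stores matrices row-major. A row-major matrix viewed as column-major is its transpose, so passing deviceMatrixB before deviceMatrixA with dimensions (numColsB, numRowsA, numColsA) makes cuBLAS compute B^T * A^T = (A * B)^T in column-major terms; read back as row-major, that buffer is exactly C = A * B, with no explicit transposes needed.

CUBLAS_CHECK, also from cuda_utils.h, presumably wraps cublasStatus_t the same way CUDA_CHECK wraps cudaError_t. A hypothetical sketch, since the macro is not part of this diff:

    // Hypothetical sketch of CUBLAS_CHECK -- not part of this commit.
    #include <cstdio>
    #include <cstdlib>
    #include <cublas_v2.h>

    #define CUBLAS_CHECK(call)                                \
        do {                                                  \
            cublasStatus_t status_ = (call);                  \
            if (status_ != CUBLAS_STATUS_SUCCESS) {           \
                fprintf(stderr, "cuBLAS error %d at %s:%d\n", \
                        (int)status_, __FILE__, __LINE__);    \
                exit(EXIT_FAILURE);                           \
            }                                                 \
        } while (0)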
@@ -9,7 +9,7 @@ TEST(MatrixOperations, AddTest) {
     float B[rows * cols] = {5, 6, 7, 8};
     float C[rows * cols] = {0};
 
-    matrixAddHost(A, B, C, rows, cols);
+    addMatricesOnGPU(A, B, C, rows, cols);
 
     float expected[rows * cols] = {6, 8, 10, 12};
     for (int i = 0; i < rows * cols; i++) {
@@ -26,7 +26,7 @@ TEST(MatrixOperations, MultTest) {
     float B[colsA * colsB] = {5, 6, 7, 8};
     float C[rowsA * colsB] = {0};
 
-    matrixMultHost(A, B, C, rowsA, colsA, colsB);
+    multiplyMatricesOnGPU(A, B, C, rowsA, colsA, colsB);
 
     float expected[rowsA * colsB] = {19, 22, 43, 50};
     for (int i = 0; i < rowsA * colsB; i++) {
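
For anyone exercising the renamed entry points outside the test suite, a minimal hypothetical driver (the include paths are assumptions; this commit does not show the template's header locations) that reproduces the expected values from the tests above:

    #include <cstdio>

    #include "matrix_add.h"   // assumed header name
    #include "matrix_mult.h"  // assumed header name

    int main() {
        // 2x2 row-major matrices, matching the unit tests above
        const int n = 2;
        float A[4] = {1, 2, 3, 4};
        float B[4] = {5, 6, 7, 8};
        float sum[4] = {0};
        float product[4] = {0};

        addMatricesOnGPU(A, B, sum, n, n);              // expect {6, 8, 10, 12}
        multiplyMatricesOnGPU(A, B, product, n, n, n);  // expect {19, 22, 43, 50}

        for (int i = 0; i < n * n; i++) {
            printf("sum[%d] = %g  product[%d] = %g\n", i, sum[i], i, product[i]);
        }
        return 0;
    }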
