Skip to content

Commit

Permalink
Merge pull request #144 from gcongiu/2023.12.19_cuda-init-err-handling
Browse files Browse the repository at this point in the history
Cuda component init error handling
  • Loading branch information
gcongiu authored Dec 20, 2023
2 parents 29ed6f2 + c4cacc8 commit f1d5857
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 27 deletions.
86 changes: 61 additions & 25 deletions src/components/cuda/cupti_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ CUresult ( *cuDeviceGetAttributePtr ) (int *, CUdevice_attribute, CUdevice);

cudaError_t ( *cudaGetDeviceCountPtr ) (int *);
cudaError_t ( *cudaGetDevicePtr ) (int *);
const char *( *cudaGetErrorStringPtr ) (cudaError_t);
cudaError_t ( *cudaSetDevicePtr ) (int);
cudaError_t ( *cudaGetDevicePropertiesPtr ) (struct cudaDeviceProp* prop, int device);
cudaError_t ( *cudaDeviceGetAttributePtr ) (int *value, enum cudaDeviceAttr attr, int device);
Expand Down Expand Up @@ -168,6 +169,7 @@ static int load_cudart_sym(void)
cudaGetDevicePtr = DLSYM_AND_CHECK(dl_rt, "cudaGetDevice");
cudaGetDeviceCountPtr = DLSYM_AND_CHECK(dl_rt, "cudaGetDeviceCount");
cudaGetDevicePropertiesPtr = DLSYM_AND_CHECK(dl_rt, "cudaGetDeviceProperties");
cudaGetErrorStringPtr = DLSYM_AND_CHECK(dl_rt, "cudaGetErrorString");
cudaDeviceGetAttributePtr = DLSYM_AND_CHECK(dl_rt, "cudaDeviceGetAttribute");
cudaSetDevicePtr = DLSYM_AND_CHECK(dl_rt, "cudaSetDevice");
cudaFreePtr = DLSYM_AND_CHECK(dl_rt, "cudaFree");
Expand All @@ -191,6 +193,7 @@ static int unload_cudart_sym(void)
cudaGetDevicePtr = NULL;
cudaGetDeviceCountPtr = NULL;
cudaGetDevicePropertiesPtr = NULL;
cudaGetErrorStringPtr = NULL;
cudaDeviceGetAttributePtr = NULL;
cudaSetDevicePtr = NULL;
cudaFreePtr = NULL;
Expand Down Expand Up @@ -297,45 +300,57 @@ static int util_dylib_cupti_version(void)
return cuptiVersion;
}

/*
 * cuptic_device_get_count -- query the number of CUDA devices.
 *
 * NOTE(review): this span comes from a commit diff rendered without +/-
 * markers, so lines of the pre-change and post-change versions of the
 * function are interleaved and the text does not compile as shown.
 * The prototype pair in cupti_common.h and the callers that pass an
 * `int *` suggest that the `int *num_gpus` form is the post-change
 * version -- confirm against the repository.
 *
 * Pointer-taking form:
 *   num_gpus  output; handed straight to cudaGetDeviceCountPtr().
 *   Returns PAPI_OK when the runtime reports cudaSuccess. Otherwise the
 *   string from cudaGetErrorStringPtr() is recorded with
 *   cuptic_disabled_reason_set() and PAPI_EMISC is returned.
 *   num_gpus is not checked for NULL here.
 *
 * void form:
 *   Keeps the count in the function-local static numDevs (initially -1)
 *   and returns it; the runtime is only queried while numDevs is still -1.
 *   The query is wrapped in CUDART_CALL with `return PAPI_EMISC` as its
 *   second argument (presumably the on-failure action -- macro not shown),
 *   so the return value carries either a count or an error code.
 */
int cuptic_device_get_count(void)
int cuptic_device_get_count(int *num_gpus)
{
/* void form: cached device count, -1 means "not queried yet". */
static int numDevs = -1;
if (numDevs != -1) {
goto fn_exit;
/* Pointer form: query the runtime on every call and keep the raw status. */
cudaError_t cuda_errno = cudaGetDeviceCountPtr(num_gpus);
if (cuda_errno != cudaSuccess) {
/* Store the runtime's own error text as the component-disabled reason. */
cuptic_disabled_reason_set(cudaGetErrorStringPtr(cuda_errno));
return PAPI_EMISC;
}
CUDART_CALL(cudaGetDeviceCountPtr(&numDevs), return PAPI_EMISC);
fn_exit:
return numDevs;
return PAPI_OK;
}

/*
 * get_gpu_compute_capability -- read a device's compute capability and
 * encode it as major * 10 + minor (the caller compares the result with 70).
 *
 * NOTE(review): this span comes from a commit diff rendered without +/-
 * markers; lines of the one-argument and two-argument versions are
 * interleaved and the text does not compile as shown. The two-argument
 * form matches the call `get_gpu_compute_capability(i, &cc)` and is
 * presumably the post-change version -- confirm against the repository.
 *
 * Two-argument form:
 *   dev_num  device index passed to cudaDeviceGetAttributePtr().
 *   cc       output; written only after both attribute queries succeed.
 *   Returns PAPI_OK on success. If either query returns something other
 *   than cudaSuccess, the text from cudaGetErrorStringPtr() is recorded
 *   with cuptic_disabled_reason_set() and PAPI_EMISC is returned.
 *
 * One-argument form:
 *   Returns the encoded capability directly; both queries are wrapped in
 *   CUDART_CALL with `return PAPI_EMISC` as the second argument
 *   (presumably the on-failure action -- macro not shown), so the return
 *   value carries either a capability or an error code.
 */
static int get_gpu_compute_capability(int dev_num)
static int get_gpu_compute_capability(int dev_num, int *cc)
{
int cc_major, cc_minor;
int cc;
CUDART_CALL(cudaDeviceGetAttributePtr(&cc_major,
cudaDevAttrComputeCapabilityMajor, dev_num),
return PAPI_EMISC );
CUDART_CALL(cudaDeviceGetAttributePtr(&cc_minor,
cudaDevAttrComputeCapabilityMinor, dev_num),
return PAPI_EMISC );
cc = cc_major * 10 + cc_minor;
return cc;
cudaError_t cuda_errno;
/* Major revision number of the device. */
cuda_errno = cudaDeviceGetAttributePtr(&cc_major, cudaDevAttrComputeCapabilityMajor, dev_num);
if (cuda_errno != cudaSuccess) {
cuptic_disabled_reason_set(cudaGetErrorStringPtr(cuda_errno));
return PAPI_EMISC;
}
/* Minor revision number of the device. */
cuda_errno = cudaDeviceGetAttributePtr(&cc_minor, cudaDevAttrComputeCapabilityMinor, dev_num);
if (cuda_errno != cudaSuccess) {
cuptic_disabled_reason_set(cudaGetErrorStringPtr(cuda_errno));
return PAPI_EMISC;
}
/* Fold major/minor into one integer, e.g. major 7, minor 0 gives 70. */
*cc = cc_major * 10 + cc_minor;
return PAPI_OK;
}

/*
 * Classification of the set of GPUs in the system, as produced by
 * util_gpu_collection_kind().
 *
 * GPU_COLLECTION_UNKNOWN is the initial value of that function's cached
 * result. GPU_COLLECTION_MIXED is the value cuptic_init() rejects with a
 * "mixed compute capabilities" message. GPU_COLLECTION_ALL_CC70 is checked
 * together with the CUPTI version in cuptic_is_runtime_perfworks_api().
 * NOTE(review): ALL_PERF / ALL_EVENTS presumably mean "every GPU uses the
 * perfworks API" / "every GPU uses the events API" -- the code that assigns
 * them is not visible here; confirm.
 */
typedef enum {GPU_COLLECTION_UNKNOWN, GPU_COLLECTION_ALL_PERF, GPU_COLLECTION_MIXED, GPU_COLLECTION_ALL_EVENTS, GPU_COLLECTION_ALL_CC70} gpu_collection_e;

/*
 * util_gpu_collection_kind -- classify the GPUs in the system into a
 * gpu_collection_e value.
 *
 * NOTE(review): this span comes from a commit diff rendered without +/-
 * markers. Lines of the pre-change form (returns the kind directly) and the
 * form taking `gpu_collection_e *coll_kind` (returns a PAPI status) are
 * interleaved, and the middle of the function is elided by the
 * "Expand All" hunk marker, so the text does not compile as shown. The
 * pointer-taking form matches the visible callers and is presumably the
 * post-change version -- confirm against the repository.
 *
 * Pointer-taking form:
 *   coll_kind  output; written at fn_exit with the cached classification.
 *   Returns papi_errno: PAPI_OK, or the status propagated from
 *   cuptic_device_get_count() / get_gpu_compute_capability().
 *
 * The classification is kept in the function-local static `kind`; once it
 * is no longer GPU_COLLECTION_UNKNOWN, later calls skip the device scan.
 */
static gpu_collection_e util_gpu_collection_kind(void)
static int util_gpu_collection_kind(gpu_collection_e *coll_kind)
{
int papi_errno = PAPI_OK;
static gpu_collection_e kind = GPU_COLLECTION_UNKNOWN;
/* Already classified on an earlier call: reuse the cached value. */
if (kind != GPU_COLLECTION_UNKNOWN) {
goto fn_exit;
}

int total_gpus = cuptic_device_get_count();
int total_gpus;
papi_errno = cuptic_device_get_count(&total_gpus);
if (papi_errno != PAPI_OK) {
/* kind is still GPU_COLLECTION_UNKNOWN here; fn_exit reports it with the error. */
goto fn_exit;
}

int i, cc;
int count_perf = 0, count_evt = 0, count_cc70 = 0;
for (i=0; i<total_gpus; i++) {
cc = get_gpu_compute_capability(i);
papi_errno = get_gpu_compute_capability(i, &cc);
if (papi_errno != PAPI_OK) {
/*
 * NOTE(review): this path returns directly instead of jumping to
 * fn_exit, so *coll_kind is not written. The visible callers test
 * the status before reading the kind, but `goto fn_exit` would
 * match the error path above -- consider aligning.
 */
return papi_errno;
}
/* Compute capability 7.0 devices are counted separately. */
if (cc == 70) {
++count_cc70;
}
Expand All @@ -361,7 +376,8 @@ static gpu_collection_e util_gpu_collection_kind(void)
kind = GPU_COLLECTION_MIXED;

fn_exit:
return kind;
*coll_kind = kind;
return papi_errno;
}

const char *cuptic_disabled_reason_g;
Expand Down Expand Up @@ -413,7 +429,13 @@ int cuptic_init(void)
goto fn_exit;
}

if (util_gpu_collection_kind() == GPU_COLLECTION_MIXED) {
gpu_collection_e kind;
papi_errno = util_gpu_collection_kind(&kind);
if (papi_errno != PAPI_OK) {
goto fn_exit;
}

if (kind == GPU_COLLECTION_MIXED) {
cuptic_disabled_reason_set("No support for systems with mixed compute capabilities, such as CC < 7.0 and CC > 7.0 GPUS.");
papi_errno = PAPI_ECMP;
goto fn_exit;
Expand All @@ -430,7 +452,12 @@ int cuptic_is_runtime_perfworks_api(void)
}
char *papi_cuda_110_cc70_perfworks_api = getenv("PAPI_CUDA_110_CC_70_PERFWORKS_API");

gpu_collection_e gpus_kind = util_gpu_collection_kind();
gpu_collection_e gpus_kind;
int papi_errno = util_gpu_collection_kind(&gpus_kind);
if (papi_errno != PAPI_OK) {
goto fn_exit;
}

unsigned int cuptiVersion = util_dylib_cupti_version();

if (gpus_kind == GPU_COLLECTION_ALL_CC70 &&
Expand Down Expand Up @@ -465,7 +492,11 @@ int cuptic_is_runtime_events_api(void)
goto fn_exit;
}

gpu_collection_e gpus_kind = util_gpu_collection_kind();
gpu_collection_e gpus_kind;
int papi_errno = util_gpu_collection_kind(&gpus_kind);
if (papi_errno != PAPI_OK) {
goto fn_exit;
}

/*
* See cupti_config.h: When NVIDIA removes the events API add a check in the following condition
Expand All @@ -489,7 +520,12 @@ struct cuptic_info {
int cuptic_ctxarr_create(cuptic_info_t *pinfo)
{
COMPDBG("Entering.\n");
cuptic_info_t cuCtx = (cuptic_info_t) papi_calloc (cuptic_device_get_count(), sizeof(*pinfo));
int total_gpus;
int papi_errno = cuptic_device_get_count(&total_gpus);
if (papi_errno != PAPI_OK) {
return PAPI_EMISC;
}
cuptic_info_t cuCtx = (cuptic_info_t) papi_calloc (total_gpus, sizeof(*pinfo));
if (cuCtx == NULL) {
return PAPI_ENOMEM;
}
Expand Down
2 changes: 1 addition & 1 deletion src/components/cuda/cupti_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ void cuptic_disabled_reason_get(const char **pmsg);

void *cuptic_load_dynamic_syms(const char *parent_path, const char *dlname, const char *search_subpaths[]);
int cuptic_shutdown(void);
int cuptic_device_get_count(void);
int cuptic_device_get_count(int *num_gpus);
int cuptic_init(void);
int cuptic_is_runtime_perfworks_api(void);
int cuptic_is_runtime_events_api(void);
Expand Down
7 changes: 6 additions & 1 deletion src/components/cuda/cupti_profiler.c
Original file line number Diff line number Diff line change
Expand Up @@ -1391,7 +1391,12 @@ int cuptip_init(void)
cuptic_disabled_reason_set("Unable to load CUDA library functions.");
goto fn_fail;
}
num_gpus = cuptic_device_get_count();

papi_errno = cuptic_device_get_count(&num_gpus);
if (papi_errno != PAPI_OK) {
goto fn_fail;
}

if (num_gpus <= 0) {
cuptic_disabled_reason_set("No GPUs found on system.");
goto fn_fail;
Expand Down

0 comments on commit f1d5857

Please sign in to comment.