From 6b817c9e54b0bacb63ec37578b3a2b1c8bf7d151 Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Fri, 3 Jan 2025 16:43:50 +0000 Subject: [PATCH 01/16] Multi gpu support --- src/components/cuda/cupti_profiler.c | 210 ++++++++++++++++-------- src/components/cuda/papi_cupti_common.c | 1 + 2 files changed, 147 insertions(+), 64 deletions(-) diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index 31bae0ca1..b7509aa9a 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -91,15 +91,15 @@ struct list_metrics_s { char chip_name[32]; MCCP_t *pmetricsContextCreateParams; int num_metrics; - cuptiu_event_table_t *nv_metrics; + const char* const* metric_names; + cuptiu_event_table_t *cuptiu_table_p; }; static void *dl_nvpw; static int num_gpus; -static int num_unique_gpus = 1; +//static int num_unique_gpus = 2; static list_metrics_t *avail_events; -static cuptiu_event_table_t cuptiu_table; static cuptiu_event_table_t *cuptiu_table_p; /* load and unload cuda function pointers */ @@ -118,6 +118,7 @@ static int initialize_perfworks_api(void); /* utility functions to init metrics and cuda native event table */ static int init_all_metrics(void); static int init_event_table(void); +static void init_main_htable(void); static int shutdown_event_table(void); static void free_all_enumerated_metrics(void); @@ -146,7 +147,8 @@ static int calculate_num_passes(struct NVPA_RawMetricsConfig *pRawMetricsConfig, /* functions to set and get cuda native event info or convert cuda native events */ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, unsigned int evt_code, int evt_pos, int gpu_id); -static int verify_events(uint64_t *events_id, int num_events, cuptiu_event_table_t **targeted_event_names); +//static int verify_events(uint64_t *events_id, int num_events, cuptiu_event_table_t **targeted_event_names); +static int verify_events(uint64_t *events_id, int num_events, 
cuptip_control_t state); static int evt_id_to_info(uint64_t event_id, event_info_t *info); static int evt_id_create(event_info_t *info, uint64_t *event_id); static int evt_code_to_name(uint64_t event_code, char *name, int len); @@ -631,7 +633,7 @@ static int nvpw_cuda_metricscontext_create(cuptip_control_t state) seee cuptip_gpu_state_s */ cuptip_gpu_state_t *gpu_ctl; - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); found = find_same_chipname(gpu_id); if (found > -1) { @@ -677,7 +679,7 @@ static int nvpw_cuda_metricscontext_destroy(cuptip_control_t state) int gpu_id, found, papi_errno = PAPI_OK; cuptip_gpu_state_t *gpu_ctl; - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); found = find_same_chipname(gpu_id); if (found > -1) { @@ -714,7 +716,7 @@ static int check_multipass(cuptip_control_t state) NVPA_Status nvpa_err; cuptip_gpu_state_t *gpu_ctl; - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); if (gpu_ctl->event_names->count == 0) { continue; @@ -1297,19 +1299,19 @@ static int find_same_chipname(int gpu_id) static int init_all_metrics(void) { int gpu_id, papi_errno = PAPI_OK; - avail_events = (list_metrics_t *) papi_calloc(num_unique_gpus, sizeof(list_metrics_t)); + avail_events = (list_metrics_t *) papi_calloc(num_gpus, sizeof(list_metrics_t)); if (avail_events == NULL) { papi_errno = PAPI_ENOMEM; goto fn_exit; } - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { papi_errno = get_chip_name(gpu_id, avail_events[gpu_id].chip_name); if (papi_errno != PAPI_OK) { goto fn_exit; } } int found; - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { found = find_same_chipname(gpu_id); if 
(found > -1) { avail_events[gpu_id].pmetricsContextCreateParams = avail_events[found].pmetricsContextCreateParams; @@ -1345,11 +1347,11 @@ static void free_all_enumerated_metrics(void) if (avail_events == NULL) { return; } - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { found = find_same_chipname(gpu_id); if (found > -1) { avail_events[gpu_id].num_metrics = 0; - avail_events[gpu_id].nv_metrics = NULL; + avail_events[gpu_id].cuptiu_table_p = NULL; avail_events[gpu_id].pmetricsContextCreateParams = NULL; continue; } @@ -1364,14 +1366,29 @@ static void free_all_enumerated_metrics(void) papi_free(avail_events[gpu_id].pmetricsContextCreateParams); avail_events[gpu_id].pmetricsContextCreateParams = NULL; - if (avail_events[gpu_id].nv_metrics) { - cuptiu_event_table_destroy( &(avail_events[gpu_id].nv_metrics) ); + if (avail_events[gpu_id].cuptiu_table_p) { + cuptiu_event_table_destroy( &(avail_events[gpu_id].cuptiu_table_p) ); } } papi_free(avail_events); avail_events = NULL; } +static void init_main_htable(void) +{ + int htable_errno; + + /* capacity is set to 2097152 as this is + the maximum number of events we allow as of now */ + cuptiu_table_p = papi_malloc(sizeof(cuptiu_event_table_t)); + cuptiu_table_p->capacity = 2097152; + cuptiu_table_p->count = 0; + + htable_init(&cuptiu_table_p->htable); + + cuptiu_table_p->events = papi_calloc(2097152, sizeof(cuptiu_event_t)); +} + /** @class cuptip_init * @brief Load and initialize API's. 
*/ @@ -1392,7 +1409,7 @@ int cuptip_init(void) if (papi_errno != PAPI_OK) { goto fn_fail; } - + /* if no gpu's are found exit */ if (num_gpus <= 0) { cuptic_disabled_reason_set("No GPUs found on system."); @@ -1411,15 +1428,19 @@ int cuptip_init(void) if (papi_errno != PAPI_OK) { goto fn_fail; } + papi_errno = cuInitPtr(0); if (papi_errno != CUDA_SUCCESS) { cuptic_disabled_reason_set("Failed to initialize CUDA driver API."); goto fn_fail; } + + /* initialize main hash table to store entries */ + init_main_htable(); + /* initialize hash table with cuda native events */ init_event_table(); - cuptiu_table_p = &cuptiu_table; return PAPI_OK; fn_fail: @@ -1436,6 +1457,7 @@ int cuptip_init(void) * @param **targeted_event_names * Event table to hold subset of user added events. */ +/* int verify_events(uint64_t *events_id, int num_events, cuptiu_event_table_t **targeted_event_names) { @@ -1469,6 +1491,42 @@ int verify_events(uint64_t *events_id, int num_events, fn_exit: return papi_errno; } +*/ +int verify_events(uint64_t *events_id, int num_events, + cuptip_control_t state) +{ + int papi_errno = PAPI_OK, i; + char name[PAPI_MAX_STR_LEN] = { 0 }; + + for (i = 0; i < num_gpus; i++) { + papi_errno = cuptiu_event_table_create_init_capacity( + num_events * num_gpus, + sizeof(cuptiu_event_t), &(state->gpu_ctl[i].event_names) + ); + if (papi_errno != PAPI_OK) { + goto fn_exit; + } + } + + for (i = 0; i < num_events; i++) { + event_info_t info; + papi_errno = evt_id_to_info(events_id[i], &info); + if (papi_errno != PAPI_OK) { + break; + } + sprintf(name, "%s", cuptiu_table_p->events[info.nameid].name); + strcpy(state->gpu_ctl[info.device].event_names->added_cuda_evts[i], name); + state->gpu_ctl[info.device].event_names->added_cuda_dev[i] = info.device; + void *p; + if (htable_find(cuptiu_table_p->htable, name, (void **) &p) != HTABLE_SUCCESS) { + htable_insert(state->gpu_ctl[info.device].event_names->htable, name, (void **) &p ); + } + 
state->gpu_ctl[info.device].event_names->count++; + } + + fn_exit: + return papi_errno; +} /** @class cuptip_ctx_create * @brief Create a profiling context for the requested Cuda events. @@ -1489,10 +1547,10 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t char name[PAPI_2MAX_STR_LEN] = { 0 }; cuptiu_event_table_t *targeted_event_names; - papi_errno = verify_events(events_id, num_events, &targeted_event_names); - if (papi_errno != PAPI_OK) { - return papi_errno; - } + //papi_errno = verify_events(events_id, num_events, &targeted_event_names); + //if (papi_errno != PAPI_OK) { + // return papi_errno; + //} /* create a cuptip_control_t struct which contains read_count, running, cupti_info_t and cuptip_gpu_state_t */ cuptip_control_t state = (cuptip_control_t) papi_calloc (1, sizeof(struct cuptip_control_s)); @@ -1502,7 +1560,7 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t /* allocate memory for the total number of gpus for the cuptip_gpu_state_t struct with the device qualifier refactor we only want to count the total number of unique gpus */ - state->gpu_ctl = (cuptip_gpu_state_t *) papi_calloc(num_unique_gpus, sizeof(cuptip_gpu_state_t)); + state->gpu_ctl = (cuptip_gpu_state_t *) papi_calloc(num_gpus, sizeof(cuptip_gpu_state_t)); if (state->gpu_ctl == NULL) { return PAPI_ENOMEM; } @@ -1510,9 +1568,9 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t counters = papi_malloc(num_events * sizeof(*counters)); /* for each unique gpu store the gpu id for that gpu index */ - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { state->gpu_ctl[gpu_id].gpu_id = gpu_id; - state->gpu_ctl[gpu_id].event_names = targeted_event_names; + //state->gpu_ctl[gpu_id].event_names = targeted_event_names; } /* register the user created cuda context for the current gpu if not already known */ @@ -1527,6 +1585,11 @@ int 
cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t goto fn_exit; } + papi_errno = verify_events(events_id, num_events, state); + if (papi_errno != PAPI_OK) { + goto fn_exit; + } + /* multipass is not supporter; therefore, we must check the Cuda native event */ papi_errno = check_multipass(state); if (papi_errno != PAPI_OK) { @@ -1565,7 +1628,8 @@ int cuptip_ctx_start(cuptip_control_t state) } /* enumerate through all of the unique gpus */ - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { + printf("gpu_id is: %d\n", gpu_id); gpu_ctl = &(state->gpu_ctl[gpu_id]); if (gpu_ctl->event_names->count == 0) { continue; @@ -1593,12 +1657,14 @@ int cuptip_ctx_start(cuptip_control_t state) papi_errno += metric_get_counter_data_prefix_image(gpu_ctl); papi_errno += create_counter_data_image(gpu_ctl); if (papi_errno != PAPI_OK) { + printf("cupti profiler host configuration.\n"); ERRDBG("Failed to create CUPTI profiler state for gpu %d\n", gpu_id); goto fn_fail; } papi_errno = begin_profiling(gpu_ctl); if (papi_errno != PAPI_OK) { + printf("begin profiling failed.\n"); ERRDBG("Failed to start profiling for gpu %d\n", gpu_id); goto fn_fail; } @@ -1638,7 +1704,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) cudaArtCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); } - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); if (gpu_ctl->event_names->count == 0) { continue; @@ -1784,7 +1850,7 @@ int cuptip_ctx_stop(cuptip_control_t state) cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); } - for (gpu_id=0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id=0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); if (gpu_ctl->event_names->count == 0) { continue; @@ -1823,7 +1889,7 @@ int cuptip_ctx_destroy(cuptip_control_t *pstate) cuptip_control_t 
state = *pstate; int i, j; int papi_errno = nvpw_cuda_metricscontext_destroy(state); - for (i = 0; i < num_unique_gpus; i++) { + for (i = 0; i < num_gpus; i++) { reset_cupti_prof_config_images( &(state->gpu_ctl[i]) ); cuptiu_event_table_destroy( &(state->gpu_ctl[i].event_names) ); for (j = 0; j < state->gpu_ctl[i].rmr_count; j++) { @@ -1931,49 +1997,64 @@ int evt_id_to_info(uint64_t event_id, event_info_t *info) */ int init_event_table(void) { - int gpu_idx, dev_id, i, listsubmetrics = 1, papi_errno = PAPI_OK; - - for (gpu_idx = 0; gpu_idx < num_unique_gpus; gpu_idx++) { - NVPW_MetricsContext_GetMetricNames_Begin_Params getMetricNameBeginParams = { - .structSize = NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE, - .pPriv = NULL, - .pMetricsContext = avail_events[gpu_idx].pmetricsContextCreateParams->pMetricsContext, - .hidePeakSubMetrics = !listsubmetrics, - .hidePerCycleSubMetrics = !listsubmetrics, - .hidePctOfPeakSubMetrics = !listsubmetrics, - }; - nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_BeginPtr(&getMetricNameBeginParams), goto fn_fail ); - - avail_events[gpu_idx].num_metrics = getMetricNameBeginParams.numMetrics; - cuptiu_table.events = papi_calloc(avail_events[gpu_idx].num_metrics, sizeof(cuptiu_event_t)); - if (cuptiu_table.events == NULL) { - papi_errno = PAPI_ENOMEM; - goto fn_fail; + int dev_id, found, table_idx = 0; + int gpu_idx, i, listsubmetrics = 1, papi_errno = PAPI_OK; + + /* instatiate struct to collect the total metric count and metric names; + instantiated here to avoid scoping issues */ + NVPW_MetricsContext_GetMetricNames_Begin_Params getMetricNameBeginParams = { NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE }; + + /* loop through all available devices on the current system */ + for (dev_id = 0; dev_id < num_gpus; dev_id++) { + found = find_same_chipname(dev_id); + /* unique device found, collect metadata */ + if (found == -1) { + /* increment table index */ + if (dev_id > 0) + table_idx++; + + /* 
assigning values to member variables */ + getMetricNameBeginParams.pPriv = NULL; + getMetricNameBeginParams.pMetricsContext = avail_events[table_idx].pmetricsContextCreateParams->pMetricsContext; + getMetricNameBeginParams.hidePeakSubMetrics = !listsubmetrics; + getMetricNameBeginParams.hidePerCycleSubMetrics = !listsubmetrics; + getMetricNameBeginParams.hidePctOfPeakSubMetrics = !listsubmetrics; + + nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_BeginPtr(&getMetricNameBeginParams), goto fn_fail ); + + /* for each unique device found, store both the total number of metrics and metric names */ + avail_events[table_idx].num_metrics = getMetricNameBeginParams.numMetrics; + avail_events[table_idx].metric_names = getMetricNameBeginParams.ppMetricNames; + + papi_errno = cuptiu_event_table_create_init_capacity(avail_events[table_idx].num_metrics * num_gpus, sizeof(cuptiu_event_t), &(avail_events[table_idx].cuptiu_table_p)); + if (papi_errno != PAPI_OK) { + goto fn_exit; + } + avail_events[table_idx].cuptiu_table_p->events = papi_calloc(avail_events[table_idx].num_metrics, sizeof(cuptiu_event_t)); } - - papi_errno = cuptiu_event_table_create_init_capacity(avail_events[gpu_idx].num_metrics * num_gpus, sizeof(cuptiu_event_t), &(avail_events[gpu_idx].nv_metrics)); - if (papi_errno != PAPI_OK) { - goto fn_exit; + /* device metadata already collected, set table index */ + else { + /* set table_idx to */ + table_idx = found; } - for (dev_id = 0; dev_id < num_gpus; dev_id++) { - for (i = 0; i < avail_events[gpu_idx].num_metrics; i++) { - papi_errno = get_ntv_events( avail_events[gpu_idx].nv_metrics, - getMetricNameBeginParams.ppMetricNames[i], - i, 0, dev_id ); - if (papi_errno != PAPI_OK) { - goto fn_exit; - } - } + + /* loop through metrics to add to overall event table */ + for (i = 0; i < avail_events[table_idx].num_metrics; i++) { + papi_errno = get_ntv_events( cuptiu_table_p, avail_events[table_idx].metric_names[i], i, 0, dev_id); + //papi_errno = get_ntv_events( 
avail_events[0].cuptiu_table_p, getMetricNameBeginParams.ppMetricNames[i], i, 0, dev_id); + if (papi_errno != PAPI_OK) + goto fn_exit; } - cuptiu_table.events = papi_realloc(cuptiu_table.events, avail_events[gpu_idx].nv_metrics->count * sizeof(cuptiu_event_t)); - cuptiu_table.count = avail_events[gpu_idx].nv_metrics->count; - cuptiu_table.htable = avail_events[gpu_idx].nv_metrics->htable; + } + + /* free memory */ + for (i = 0; i < table_idx; i++) { NVPW_MetricsContext_GetMetricNames_End_Params getMetricNameEndParams = { .structSize = NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE, .pPriv = NULL, .pMetricsContext = avail_events[gpu_idx].pmetricsContextCreateParams->pMetricsContext, - }; + }; nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_EndPtr((NVPW_MetricsContext_GetMetricNames_End_Params *) &getMetricNameEndParams), goto fn_fail ); } @@ -2002,8 +2083,9 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, int papi_errno; char description[256]; int *count = &evt_table->count; - cuptiu_event_t *events = cuptiu_table.events; - + //cuptiu_event_t *events = cuptiu_table.events; + cuptiu_event_t *events = evt_table->events; + /* check to see if evt_name argument has been provided */ if (evt_name == NULL) { return PAPI_EINVAL; diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c index f3aeb9e08..f378418af 100644 --- a/src/components/cuda/papi_cupti_common.c +++ b/src/components/cuda/papi_cupti_common.c @@ -714,6 +714,7 @@ int cuptic_device_acquire(cuptiu_event_table_t *evt_table) return papi_errno; } if (bitmask & global_gpu_bitmask) { + printf("We fail inside here.\n"); return PAPI_ECNFLCT; } _papi_hwi_lock(_cuda_lock); From 155a81190ea7566c363121d5e141d4b482b43dbb Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Tue, 7 Jan 2025 03:15:18 +0000 Subject: [PATCH 02/16] Support for multi-gpus tested on system with H100 and V100. 
--- src/components/cuda/cupti_profiler.c | 48 ++++++++++++++++++---------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index b7509aa9a..c72973c4b 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -97,7 +97,6 @@ struct list_metrics_s { static void *dl_nvpw; static int num_gpus; -//static int num_unique_gpus = 2; static list_metrics_t *avail_events; static cuptiu_event_table_t *cuptiu_table_p; @@ -155,7 +154,7 @@ static int evt_code_to_name(uint64_t event_code, char *name, int len); static int evt_name_to_basename(const char *name, char *base, int len); static int evt_name_to_device(const char *name, int *device); static int retrieve_metric_descr( NVPA_MetricsContext *pMetricsContext, const char *evt_name, - char *description, int gpu_id ); + char *description, const char *chip_name ); static int retrieve_metric_rmr( NVPA_MetricsContext *pMetricsContext, const char *evt_name, int *numDep, NVPA_RawMetricRequest **pRMR ); @@ -164,6 +163,7 @@ static int get_event_collection_method(const char *evt_name); static int get_event_names_rmr(cuptip_gpu_state_t *gpu_ctl); static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl); static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts); +static int num_unique_devs(int num_gpus); /* nvperf function pointers */ NVPA_Status ( *NVPW_GetSupportedChipNamesPtr ) (NVPW_GetSupportedChipNames_Params* params); @@ -1374,19 +1374,27 @@ static void free_all_enumerated_metrics(void) avail_events = NULL; } +/** @class init_main_htable + * @brief Initialize the main htable that will + * store the metric info for all devices.. 
+*/ static void init_main_htable(void) { - int htable_errno; + int htable_errno, i, val = 1, base = 2; + + /* get the total number of possible metrics to allocate for, + as of now we allow only 2^NAMEID_WIDTH metrics */ + for (i = 0; i < NAMEID_WIDTH; i++) { + val *= base; + } - /* capacity is set to 2097152 as this is - the maximum number of events we allow as of now */ cuptiu_table_p = papi_malloc(sizeof(cuptiu_event_table_t)); - cuptiu_table_p->capacity = 2097152; + cuptiu_table_p->capacity = val; cuptiu_table_p->count = 0; htable_init(&cuptiu_table_p->htable); - cuptiu_table_p->events = papi_calloc(2097152, sizeof(cuptiu_event_t)); + cuptiu_table_p->events = papi_calloc(val, sizeof(cuptiu_event_t)); } /** @class cuptip_init @@ -1415,7 +1423,7 @@ int cuptip_init(void) cuptic_disabled_reason_set("No GPUs found on system."); goto fn_fail; } - + /* initialize cupti profiler and perfworks api */ papi_errno = initialize_cupti_profiler_api(); papi_errno += initialize_perfworks_api(); @@ -1435,7 +1443,6 @@ int cuptip_init(void) goto fn_fail; } - /* initialize main hash table to store entries */ init_main_htable(); @@ -1590,7 +1597,7 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t goto fn_exit; } - /* multipass is not supporter; therefore, we must check the Cuda native event */ + /* multipass is not supported; therefore, we must check the Cuda native event */ papi_errno = check_multipass(state); if (papi_errno != PAPI_OK) { goto fn_exit; @@ -1629,7 +1636,6 @@ int cuptip_ctx_start(cuptip_control_t state) /* enumerate through all of the unique gpus */ for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { - printf("gpu_id is: %d\n", gpu_id); gpu_ctl = &(state->gpu_ctl[gpu_id]); if (gpu_ctl->event_names->count == 0) { continue; @@ -2145,7 +2151,7 @@ static int shutdown_event_table(void) * @param gpu_id * Device number, e.g. 0, 1, 2, ... ,etc. 
*/ -static int retrieve_metric_descr( NVPA_MetricsContext *pMetricsContext, const char *evt_name, char *description, int gpu_id) +static int retrieve_metric_descr( NVPA_MetricsContext *pMetricsContext, const char *evt_name, char *description, const char *chip_name) { COMPDBG("Entering.\n"); int num_dep, i, len, passes, papi_errno; @@ -2216,7 +2222,7 @@ static int retrieve_metric_descr( NVPA_MetricsContext *pMetricsContext, const ch .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE, .pPriv = NULL, // assign to NULL .activityKind = NVPA_ACTIVITY_KIND_PROFILER, - .pChipName = avail_events[gpu_id].chip_name, + .pChipName = chip_name, }; nvpa_err = NVPW_CUDA_RawMetricsConfig_CreatePtr(&nvpw_metricsConfigCreateParams); if (nvpa_err != NVPA_STATUS_SUCCESS) { @@ -2541,9 +2547,11 @@ static int evt_code_to_name(uint64_t event_code, char *name, int len) int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) { - int papi_errno, len, gpu_id; + int papi_errno, len, i, gpu_id; event_info_t inf; char description[PAPI_HUGE_STR_LEN]; + + /* get event code info */ papi_errno = evt_id_to_info(event_code, &inf); if (papi_errno != PAPI_OK) { return papi_errno; @@ -2551,8 +2559,15 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) /* collect the description and calculated numpass for a specific nameid */ if (cuptiu_table_p->events[inf.nameid].desc[0] == 0) { - papi_errno = retrieve_metric_descr( avail_events[0].pmetricsContextCreateParams->pMetricsContext, - cuptiu_table_p->events[inf.nameid].name, cuptiu_table_p->events[inf.nameid].desc, 0 ); + for (i = 0; i < num_gpus; ++i) { + if (cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) { + gpu_id = i; + break; + } + } + papi_errno = retrieve_metric_descr( avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext, + cuptiu_table_p->events[inf.nameid].name, cuptiu_table_p->events[inf.nameid].desc, + 
avail_events[gpu_id].pmetricsContextCreateParams->pChipName ); if (papi_errno != PAPI_OK) { return papi_errno; } @@ -2569,7 +2584,6 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) break; case DEVICE_FLAG: { - int i; char devices[PAPI_MAX_STR_LEN] = { 0 }; for (i = 0; i < num_gpus; ++i) { if (cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) { From 41832798e58cea271402c0c26ba685a9c58a3339 Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Tue, 7 Jan 2025 03:17:46 +0000 Subject: [PATCH 03/16] Revert change in papi_cupti_common.c --- src/components/cuda/papi_cupti_common.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c index f378418af..f3aeb9e08 100644 --- a/src/components/cuda/papi_cupti_common.c +++ b/src/components/cuda/papi_cupti_common.c @@ -714,7 +714,6 @@ int cuptic_device_acquire(cuptiu_event_table_t *evt_table) return papi_errno; } if (bitmask & global_gpu_bitmask) { - printf("We fail inside here.\n"); return PAPI_ECNFLCT; } _papi_hwi_lock(_cuda_lock); From 2aabc4bbe375982064b3fb3b23124f9586871ae0 Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Tue, 7 Jan 2025 15:15:36 +0000 Subject: [PATCH 04/16] Code cleanup --- src/components/cuda/cupti_profiler.c | 75 ++++++++-------------------- 1 file changed, 21 insertions(+), 54 deletions(-) diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index c72973c4b..a340ea499 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -825,6 +825,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pChipName = avail_events[gpu_ctl->gpu_id].chip_name, }; nvpwCheckErrors( NVPW_CUDA_RawMetricsConfig_CreatePtr(&nvpw_metricsConfigCreateParams), goto fn_fail ); + printf("Made it past first nvpwCheckErrors.\n"); if( gpu_ctl->counterAvailabilityImage.data != NULL) { 
NVPW_RawMetricsConfig_SetCounterAvailability_Params setCounterAvailabilityParams = { @@ -834,6 +835,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pCounterAvailabilityImage = gpu_ctl->counterAvailabilityImage.data, }; nvpwCheckErrors( NVPW_RawMetricsConfig_SetCounterAvailabilityPtr(&setCounterAvailabilityParams), goto fn_fail ); + printf("Made it pass second nvpwCheckErrors.\n"); }; /* NOTE: maxPassCount is being set to 1 as a final safety net to limit metric collection to a single pass. @@ -847,6 +849,8 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .maxPassCount = 1, }; nvpwCheckErrors( NVPW_RawMetricsConfig_BeginPassGroupPtr(&beginPassGroupParams), goto fn_fail ); + printf("Made it pass third nvpwCheckErrors.\n"); + NVPW_RawMetricsConfig_AddMetrics_Params addMetricsParams = { .structSize = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE, @@ -855,7 +859,13 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pRawMetricRequests = gpu_ctl->rmr, .numMetricRequests = gpu_ctl->rmr_count, }; - nvpwCheckErrors( NVPW_RawMetricsConfig_AddMetricsPtr(&addMetricsParams), goto fn_fail ); + //nvpwCheckErrors( NVPW_RawMetricsConfig_AddMetricsPtr(&addMetricsParams), goto fn_fail ); + NVPA_Status _status = NVPW_RawMetricsConfig_AddMetricsPtr(&addMetricsParams); + if (_status != NVPA_STATUS_SUCCESS) { + printf("Failed with status: %d\n", _status); + } + printf("Made it pass fourth nvpwCheckErrors.\n"); + NVPW_RawMetricsConfig_EndPassGroup_Params endPassGroupParams = { .structSize = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE, @@ -863,6 +873,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig, }; nvpwCheckErrors( NVPW_RawMetricsConfig_EndPassGroupPtr(&endPassGroupParams), goto fn_fail ); + printf("Made it past fifth nvpwCheckErrors.\n"); NVPW_RawMetricsConfig_GenerateConfigImage_Params generateConfigImageParams = { 
.structSize = NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE, @@ -870,6 +881,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig, }; nvpwCheckErrors( NVPW_RawMetricsConfig_GenerateConfigImagePtr(&generateConfigImageParams), goto fn_fail ); + printf("Made it past sixth nvpwCheckErrors.\n"); NVPW_RawMetricsConfig_GetConfigImage_Params getConfigImageParams = { .structSize = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE, @@ -879,6 +891,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pBuffer = NULL, }; nvpwCheckErrors( NVPW_RawMetricsConfig_GetConfigImagePtr(&getConfigImageParams), goto fn_fail ); + printf("Made it past seventh nvpwCheckErrors.\n"); gpu_ctl->configImage.size = getConfigImageParams.bytesCopied; gpu_ctl->configImage.data = (uint8_t *) papi_calloc(gpu_ctl->configImage.size, sizeof(uint8_t)); @@ -890,6 +903,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) getConfigImageParams.bytesAllocated = gpu_ctl->configImage.size; getConfigImageParams.pBuffer = gpu_ctl->configImage.data; nvpwCheckErrors( NVPW_RawMetricsConfig_GetConfigImagePtr(&getConfigImageParams), goto fn_fail ); + printf("Made it past eigth nvpwCheckErrors.\n"); NVPW_RawMetricsConfig_Destroy_Params rawMetricsConfigDestroyParams = { .structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE, @@ -897,6 +911,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig, }; nvpwCheckErrors( NVPW_RawMetricsConfig_DestroyPtr((NVPW_RawMetricsConfig_Destroy_Params *) &rawMetricsConfigDestroyParams), goto fn_fail ); + printf("Made it past ninth nvpwCheckErrors.\n"); return PAPI_OK; fn_fail: @@ -1454,51 +1469,6 @@ int cuptip_init(void) return PAPI_EMISC; } -/** @class verify_events - * @brief Verify user added events and create a subset table to be used for - * start, stop, etc. 
- * @param *events_id - * Cuda native event id's. - * @param num_events - * Number of Cuda native events a user is wanting to count. - * @param **targeted_event_names - * Event table to hold subset of user added events. -*/ -/* -int verify_events(uint64_t *events_id, int num_events, - cuptiu_event_table_t **targeted_event_names) -{ - int papi_errno = PAPI_OK, i; - char name[PAPI_MAX_STR_LEN] = { 0 }; - - papi_errno = cuptiu_event_table_create_init_capacity( - num_events * num_gpus, - sizeof(cuptiu_event_t), targeted_event_names - ); - if (papi_errno != PAPI_OK) { - goto fn_exit; - } - - for (i = 0; i < num_events; i++) { - event_info_t info; - papi_errno = evt_id_to_info(events_id[i], &info); - if (papi_errno != PAPI_OK) { - break; - } - sprintf(name, "%s", cuptiu_table_p->events[info.nameid].name); - strcpy((*targeted_event_names)->added_cuda_evts[i], name); - (*targeted_event_names)->added_cuda_dev[i] = info.device; - void *p; - if (htable_find(cuptiu_table_p->htable, name, (void **) &p) != HTABLE_SUCCESS) { - htable_insert((*targeted_event_names)->htable, name, (void **) &p ); - } - (*targeted_event_names)->count++; - } - - fn_exit: - return papi_errno; -} -*/ int verify_events(uint64_t *events_id, int num_events, cuptip_control_t state) { @@ -1554,11 +1524,6 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t char name[PAPI_2MAX_STR_LEN] = { 0 }; cuptiu_event_table_t *targeted_event_names; - //papi_errno = verify_events(events_id, num_events, &targeted_event_names); - //if (papi_errno != PAPI_OK) { - // return papi_errno; - //} - /* create a cuptip_control_t struct which contains read_count, running, cupti_info_t and cuptip_gpu_state_t */ cuptip_control_t state = (cuptip_control_t) papi_calloc (1, sizeof(struct cuptip_control_s)); if (state == NULL) { @@ -1660,8 +1625,11 @@ int cuptip_ctx_start(cuptip_control_t state) /* CUPTI profiler host configuration */ papi_errno = metric_get_config_image(gpu_ctl); + printf("papi_errno 
first: %d\n", papi_errno); papi_errno += metric_get_counter_data_prefix_image(gpu_ctl); + printf("papi_errno second: %d\n", papi_errno); papi_errno += create_counter_data_image(gpu_ctl); + printf("papi_errno third: %d\n", papi_errno); if (papi_errno != PAPI_OK) { printf("cupti profiler host configuration.\n"); ERRDBG("Failed to create CUPTI profiler state for gpu %d\n", gpu_id); @@ -2047,7 +2015,6 @@ int init_event_table(void) /* loop through metrics to add to overall event table */ for (i = 0; i < avail_events[table_idx].num_metrics; i++) { papi_errno = get_ntv_events( cuptiu_table_p, avail_events[table_idx].metric_names[i], i, 0, dev_id); - //papi_errno = get_ntv_events( avail_events[0].cuptiu_table_p, getMetricNameBeginParams.ppMetricNames[i], i, 0, dev_id); if (papi_errno != PAPI_OK) goto fn_exit; } @@ -2060,7 +2027,7 @@ int init_event_table(void) .structSize = NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE, .pPriv = NULL, .pMetricsContext = avail_events[gpu_idx].pmetricsContextCreateParams->pMetricsContext, - }; + }; nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_EndPtr((NVPW_MetricsContext_GetMetricNames_End_Params *) &getMetricNameEndParams), goto fn_fail ); } @@ -2089,7 +2056,6 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, int papi_errno; char description[256]; int *count = &evt_table->count; - //cuptiu_event_t *events = cuptiu_table.events; cuptiu_event_t *events = evt_table->events; /* check to see if evt_name argument has been provided */ @@ -2559,6 +2525,7 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) /* collect the description and calculated numpass for a specific nameid */ if (cuptiu_table_p->events[inf.nameid].desc[0] == 0) { + /* find a matching device id to get correct MetricsContext and chip name */ for (i = 0; i < num_gpus; ++i) { if (cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) { gpu_id = i; From 40516226d4956cb9d46b049e883a7249dddb2385 
Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Fri, 17 Jan 2025 14:46:49 +0000 Subject: [PATCH 05/16] Updates to support multi-gpu for Cuda component. --- src/components/cuda/cupti_profiler.c | 194 +++++++++++++-------------- src/components/cuda/cupti_utils.h | 13 ++ 2 files changed, 104 insertions(+), 103 deletions(-) diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index a340ea499..d29bb3340 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -47,9 +47,7 @@ typedef struct byte_array_s byte_array_t; typedef struct cuptip_gpu_state_s cuptip_gpu_state_t; -typedef struct list_metrics_s list_metrics_t; typedef struct NVPA_MetricsContext NVPA_MetricsContext; -typedef NVPW_CUDA_MetricsContext_Create_Params MCCP_t; typedef struct { int device; @@ -65,7 +63,7 @@ struct byte_array_s { struct cuptip_gpu_state_s { int gpu_id; - cuptiu_event_table_t *event_names; + cuptiu_event_table_t *targeted_events; int rmr_count; NVPA_RawMetricRequest *rmr; MCCP_t *pmetricsContextCreateParams; @@ -87,17 +85,9 @@ struct cuptip_control_s { cuptic_info_t info; }; -struct list_metrics_s { - char chip_name[32]; - MCCP_t *pmetricsContextCreateParams; - int num_metrics; - const char* const* metric_names; - cuptiu_event_table_t *cuptiu_table_p; -}; - static void *dl_nvpw; static int num_gpus; -static list_metrics_t *avail_events; +static gpu_record_t *avail_gpu_info; static cuptiu_event_table_t *cuptiu_table_p; @@ -116,8 +106,8 @@ static int initialize_perfworks_api(void); /* utility functions to init metrics and cuda native event table */ static int init_all_metrics(void); -static int init_event_table(void); static void init_main_htable(void); +static int init_event_table(void); static int shutdown_event_table(void); static void free_all_enumerated_metrics(void); @@ -146,7 +136,6 @@ static int calculate_num_passes(struct NVPA_RawMetricsConfig *pRawMetricsConfig, /* functions to set and get cuda native event 
info or convert cuda native events */ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, unsigned int evt_code, int evt_pos, int gpu_id); -//static int verify_events(uint64_t *events_id, int num_events, cuptiu_event_table_t **targeted_event_names); static int verify_events(uint64_t *events_id, int num_events, cuptip_control_t state); static int evt_id_to_info(uint64_t event_id, event_info_t *info); static int evt_id_create(event_info_t *info, uint64_t *event_id); @@ -160,7 +149,7 @@ static int retrieve_metric_rmr( NVPA_MetricsContext *pMetricsContext, const char /* misc */ static int get_event_collection_method(const char *evt_name); -static int get_event_names_rmr(cuptip_gpu_state_t *gpu_ctl); +static int get_targeted_events_rmr(cuptip_gpu_state_t *gpu_ctl); static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl); static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts); static int num_unique_devs(int num_gpus); @@ -470,7 +459,6 @@ static int initialize_perfworks_api(void) return PAPI_OK; } - static int get_chip_name(int dev_num, char* chipName) { int papi_errno; @@ -489,7 +477,7 @@ static int get_chip_name(int dev_num, char* chipName) return PAPI_OK; } -/** @class get_event_names_rmr +/** @class get_targeted_events_rmr * @brief For a Cuda native event name collect raw metrics and count * of raw metrics for collection. Raw Metrics are one layer of the Metric API * and contains the list of raw counters and generates configuration file @@ -499,7 +487,7 @@ static int get_chip_name(int dev_num, char* chipName) * Structure of type cuptip_gpu_state_t which has member variables such as * gpu_id, rmr, rmr_count, and more. 
*/ -static int get_event_names_rmr(cuptip_gpu_state_t *gpu_ctl) +static int get_targeted_events_rmr(cuptip_gpu_state_t *gpu_ctl) { COMPDBG("Entering.\n"); int gpu_id, num_dep, count_raw_metrics = 0, papi_errno = PAPI_OK; @@ -508,11 +496,11 @@ static int get_event_names_rmr(cuptip_gpu_state_t *gpu_ctl) cuptiu_event_t *evt_rec; /* for each event in the event table collect the raw metric requests */ - for (i = 0; i < gpu_ctl->event_names->count; i++) { + for (i = 0; i < gpu_ctl->targeted_events->count; i++) { /* Not using the correct global event names now.*/ papi_errno = retrieve_metric_rmr( gpu_ctl->pmetricsContextCreateParams->pMetricsContext, - gpu_ctl->event_names->added_cuda_evts[i], &num_dep, + gpu_ctl->targeted_events->added_cuda_evts[i], &num_dep, &collect_rmr ); /* why is PAPI_ENOEVNT hard coded? */ @@ -651,7 +639,7 @@ static int nvpw_cuda_metricscontext_create(cuptip_control_t state) /* setting metadata values */ pMCCP->structSize = NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE; - pMCCP->pChipName = avail_events[gpu_id].chip_name; + pMCCP->pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name; /* create context */ nvpa_err = NVPW_CUDA_MetricsContext_CreatePtr(pMCCP); @@ -718,11 +706,11 @@ static int check_multipass(cuptip_control_t state) for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); - if (gpu_ctl->event_names->count == 0) { + if (gpu_ctl->targeted_events->count == 0) { continue; } - papi_errno = get_event_names_rmr(gpu_ctl); + papi_errno = get_targeted_events_rmr(gpu_ctl); if (papi_errno != PAPI_OK) { goto fn_exit; } @@ -732,7 +720,7 @@ static int check_multipass(cuptip_control_t state) .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE, .pPriv = NULL, .activityKind = NVPA_ACTIVITY_KIND_PROFILER, - .pChipName = avail_events[gpu_id].chip_name, + .pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name, }; nvpa_err = NVPW_CUDA_RawMetricsConfig_CreatePtr( &nvpw_metricsConfigCreateParams 
@@ -809,7 +797,7 @@ static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl) /** @class metric_get_config_image * @brief Retrieves binary ConfigImage for the Cuda native event metrics listed - * for collection. The function get_event_names_rmr( ... ) must be + * for collection. The function get_targeted_events_rmr( ... ) must be * called before this step is possible. * @param *gpu_ctl * Structure of type cuptip_gpu_state_t which has member variables such as @@ -822,7 +810,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE, .pPriv = NULL, .activityKind = NVPA_ACTIVITY_KIND_PROFILER, - .pChipName = avail_events[gpu_ctl->gpu_id].chip_name, + .pChipName = cuptiu_table_p->avail_gpu_info[gpu_ctl->gpu_id].chip_name, }; nvpwCheckErrors( NVPW_CUDA_RawMetricsConfig_CreatePtr(&nvpw_metricsConfigCreateParams), goto fn_fail ); printf("Made it past first nvpwCheckErrors.\n"); @@ -920,7 +908,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) /** @class metric_get_counter_data_prefix_image * @brief Retrieves binary CounterDataPrefix for the Cuda native event metrics - * listed for collection. The function get_event_names_rmr( ... ) + * listed for collection. The function get_targeted_events_rmr( ... ) * must be called before this step is possible. 
* @param *gpu_ctl * Structure of type cuptip_gpu_state_t which has member variables such as @@ -932,7 +920,7 @@ static int metric_get_counter_data_prefix_image(cuptip_gpu_state_t *gpu_ctl) NVPW_CounterDataBuilder_Create_Params counterDataBuilderCreateParams = { .structSize = NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE, .pPriv = NULL, - .pChipName = avail_events[gpu_ctl->gpu_id].chip_name, + .pChipName = cuptiu_table_p->avail_gpu_info[gpu_ctl->gpu_id].chip_name, }; nvpwCheckErrors( NVPW_CounterDataBuilder_CreatePtr(&counterDataBuilderCreateParams), goto fn_fail ); @@ -1224,7 +1212,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts) { COMPDBG("eval_metric_values. dev = %d\n", gpu_ctl->gpu_id); int i, papi_errno = PAPI_OK; - int numMetrics = gpu_ctl->event_names->count; + int numMetrics = gpu_ctl->targeted_events->count; double *gpuValues; char **metricNames; @@ -1248,7 +1236,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts) } for (i = 0; i < numMetrics; i++) { - metricNames[i] = gpu_ctl->event_names->added_cuda_evts[i]; + metricNames[i] = gpu_ctl->targeted_events->added_cuda_evts[i]; LOGDBG("Setting metric name %s\n", metricNames[i]); } @@ -1275,7 +1263,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts) nvpwCheckErrors( NVPW_MetricsContext_EvaluateToGpuValuesPtr(&evalToGpuParams), goto fn_fail ); /* store the gpu values */ - for (i = 0; i < (int) gpu_ctl->event_names->count; i++) { + for (i = 0; i < (int) gpu_ctl->targeted_events->count; i++) { counts[i] = gpuValues[i]; } @@ -1299,14 +1287,13 @@ static int find_same_chipname(int gpu_id) { int i; for (i = 0; i < gpu_id; i++) { - if (!strcmp(avail_events[gpu_id].chip_name, avail_events[i].chip_name)) { + if (!strcmp(cuptiu_table_p->avail_gpu_info[gpu_id].chip_name, cuptiu_table_p->avail_gpu_info[i].chip_name)) { return i; } } return -1; } - /** @class init_all_metrics * @brief Initialize metrics for a specific GPU. 
* @@ -1314,13 +1301,13 @@ static int find_same_chipname(int gpu_id) static int init_all_metrics(void) { int gpu_id, papi_errno = PAPI_OK; - avail_events = (list_metrics_t *) papi_calloc(num_gpus, sizeof(list_metrics_t)); - if (avail_events == NULL) { + cuptiu_table_p->avail_gpu_info = (gpu_record_t *) papi_calloc(num_gpus, sizeof(gpu_record_t)); + if (cuptiu_table_p->avail_gpu_info == NULL) { papi_errno = PAPI_ENOMEM; goto fn_exit; } for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { - papi_errno = get_chip_name(gpu_id, avail_events[gpu_id].chip_name); + papi_errno = get_chip_name(gpu_id, cuptiu_table_p->avail_gpu_info[gpu_id].chip_name); if (papi_errno != PAPI_OK) { goto fn_exit; } @@ -1329,7 +1316,7 @@ static int init_all_metrics(void) for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { found = find_same_chipname(gpu_id); if (found > -1) { - avail_events[gpu_id].pmetricsContextCreateParams = avail_events[found].pmetricsContextCreateParams; + cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams = cuptiu_table_p->avail_gpu_info[found].pmetricsContextCreateParams; continue; } MCCP_t *pMCCP = (MCCP_t *) papi_calloc(1, sizeof(MCCP_t)); @@ -1338,10 +1325,10 @@ static int init_all_metrics(void) goto fn_exit; } pMCCP->structSize = NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE; - pMCCP->pChipName = avail_events[gpu_id].chip_name; + pMCCP->pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name; nvpwCheckErrors( NVPW_CUDA_MetricsContext_CreatePtr(pMCCP), goto fn_fail ); - avail_events[gpu_id].pmetricsContextCreateParams = pMCCP; + cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams = pMCCP; } fn_exit: @@ -1359,57 +1346,52 @@ static void free_all_enumerated_metrics(void) COMPDBG("Entering.\n"); int gpu_id, found; NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams; - if (avail_events == NULL) { + if (cuptiu_table_p->avail_gpu_info == NULL) { return; } for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { found = 
find_same_chipname(gpu_id); if (found > -1) { - avail_events[gpu_id].num_metrics = 0; - avail_events[gpu_id].cuptiu_table_p = NULL; - avail_events[gpu_id].pmetricsContextCreateParams = NULL; + cuptiu_table_p->avail_gpu_info[gpu_id].num_metrics = 0; + cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams = NULL; continue; } - if (avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext) { + if (cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams->pMetricsContext) { metricsContextDestroyParams = (NVPW_MetricsContext_Destroy_Params) { .structSize = NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE, .pPriv = NULL, - .pMetricsContext = avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext, + .pMetricsContext = cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams->pMetricsContext, }; nvpwCheckErrors(NVPW_MetricsContext_DestroyPtr(&metricsContextDestroyParams), ); } - papi_free(avail_events[gpu_id].pmetricsContextCreateParams); - avail_events[gpu_id].pmetricsContextCreateParams = NULL; + papi_free(cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams); + cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams = NULL; - if (avail_events[gpu_id].cuptiu_table_p) { - cuptiu_event_table_destroy( &(avail_events[gpu_id].cuptiu_table_p) ); - } } - papi_free(avail_events); - avail_events = NULL; + papi_free(cuptiu_table_p->avail_gpu_info); + cuptiu_table_p->avail_gpu_info = NULL; } /** @class init_main_htable - * @brief Initialize the main htable that will - * store the metric info for all devices.. + * @brief Initialize the main htable used to collect metrics. 
*/ static void init_main_htable(void) { - int htable_errno, i, val = 1, base = 2; + int i, val = 1, base = 2; - /* get the total number of possible metrics to allocate for, - as of now we allow only 2^NAMEID_WIDTH metrics */ + /* allocate 2 ^ 21 metric names, this matches the number of bits for the event encoding format */ for (i = 0; i < NAMEID_WIDTH; i++) { val *= base; - } - + } + + /* allocate memory */ cuptiu_table_p = papi_malloc(sizeof(cuptiu_event_table_t)); - cuptiu_table_p->capacity = val; - cuptiu_table_p->count = 0; - - htable_init(&cuptiu_table_p->htable); - + cuptiu_table_p->capacity = val; + cuptiu_table_p->count = 0; cuptiu_table_p->events = papi_calloc(val, sizeof(cuptiu_event_t)); + + /* initialize the main hash table for metric collection */ + htable_init(&cuptiu_table_p->htable); } /** @class cuptip_init @@ -1447,6 +1429,9 @@ int cuptip_init(void) goto fn_fail; } + /* init htable and allocate memory */ + init_main_htable(); + papi_errno = init_all_metrics(); if (papi_errno != PAPI_OK) { goto fn_fail; @@ -1458,9 +1443,6 @@ int cuptip_init(void) goto fn_fail; } - /* initialize main hash table to store entries */ - init_main_htable(); - /* initialize hash table with cuda native events */ init_event_table(); @@ -1478,7 +1460,7 @@ int verify_events(uint64_t *events_id, int num_events, for (i = 0; i < num_gpus; i++) { papi_errno = cuptiu_event_table_create_init_capacity( num_events * num_gpus, - sizeof(cuptiu_event_t), &(state->gpu_ctl[i].event_names) + sizeof(cuptiu_event_t), &(state->gpu_ctl[i].targeted_events) ); if (papi_errno != PAPI_OK) { goto fn_exit; @@ -1492,13 +1474,13 @@ int verify_events(uint64_t *events_id, int num_events, break; } sprintf(name, "%s", cuptiu_table_p->events[info.nameid].name); - strcpy(state->gpu_ctl[info.device].event_names->added_cuda_evts[i], name); - state->gpu_ctl[info.device].event_names->added_cuda_dev[i] = info.device; + strcpy(state->gpu_ctl[info.device].targeted_events->added_cuda_evts[i], name); + 
state->gpu_ctl[info.device].targeted_events->added_cuda_dev[i] = info.device; void *p; if (htable_find(cuptiu_table_p->htable, name, (void **) &p) != HTABLE_SUCCESS) { - htable_insert(state->gpu_ctl[info.device].event_names->htable, name, (void **) &p ); + htable_insert(state->gpu_ctl[info.device].targeted_events->htable, name, (void **) &p ); } - state->gpu_ctl[info.device].event_names->count++; + state->gpu_ctl[info.device].targeted_events->count++; } fn_exit: @@ -1522,7 +1504,7 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t int papi_errno = PAPI_OK, gpu_id, i; long long *counters = NULL; char name[PAPI_2MAX_STR_LEN] = { 0 }; - cuptiu_event_table_t *targeted_event_names; + cuptiu_event_table_t *targeted_targeted_events; /* create a cuptip_control_t struct which contains read_count, running, cupti_info_t and cuptip_gpu_state_t */ cuptip_control_t state = (cuptip_control_t) papi_calloc (1, sizeof(struct cuptip_control_s)); @@ -1542,7 +1524,7 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t /* for each unique gpu store the gpu id for that gpu index */ for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { state->gpu_ctl[gpu_id].gpu_id = gpu_id; - //state->gpu_ctl[gpu_id].event_names = targeted_event_names; + //state->gpu_ctl[gpu_id].targeted_events = targeted_targeted_events; } /* register the user created cuda context for the current gpu if not already known */ @@ -1602,11 +1584,11 @@ int cuptip_ctx_start(cuptip_control_t state) /* enumerate through all of the unique gpus */ for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); - if (gpu_ctl->event_names->count == 0) { + if (gpu_ctl->targeted_events->count == 0) { continue; } - LOGDBG("Device num %d: event_count %d, rmr count %d\n", gpu_id, gpu_ctl->event_names->count, gpu_ctl->rmr_count); - papi_errno = cuptic_device_acquire(state->gpu_ctl[gpu_id].event_names); + LOGDBG("Device num %d: event_count %d, rmr count %d\n", 
gpu_id, gpu_ctl->targeted_events->count, gpu_ctl->rmr_count); + papi_errno = cuptic_device_acquire(state->gpu_ctl[gpu_id].targeted_events); if (papi_errno != PAPI_OK) { ERRDBG("Profiling same gpu from multiple event sets not allowed.\n"); return papi_errno; @@ -1680,7 +1662,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); - if (gpu_ctl->event_names->count == 0) { + if (gpu_ctl->targeted_events->count == 0) { continue; } @@ -1718,13 +1700,13 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) if (papi_errno != PAPI_OK) { goto fn_exit; } - for (i = 0; i < (int) gpu_ctl->event_names->count; i++) { + for (i = 0; i < (int) gpu_ctl->targeted_events->count; i++) { if (state->read_count == 0) { counter_vals[i] = counts[i]; } else { /* determine collection method such as max, min, sum, and avg for an added Cuda native event */ - method = get_event_collection_method(gpu_ctl->event_names->added_cuda_evts[i]); + method = get_event_collection_method(gpu_ctl->targeted_events->added_cuda_evts[i]); switch (method) { case CUDA_SUM: counter_vals[i] += counts[i]; @@ -1826,7 +1808,7 @@ int cuptip_ctx_stop(cuptip_control_t state) for (gpu_id=0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); - if (gpu_ctl->event_names->count == 0) { + if (gpu_ctl->targeted_events->count == 0) { continue; } papi_errno = cuptic_ctxarr_get_ctx(state->info, gpu_id, &ctx); @@ -1835,7 +1817,7 @@ int cuptip_ctx_stop(cuptip_control_t state) if (papi_errno != PAPI_OK) { goto fn_fail; } - papi_errno = cuptic_device_release(state->gpu_ctl[gpu_id].event_names); + papi_errno = cuptic_device_release(state->gpu_ctl[gpu_id].targeted_events); if (papi_errno != PAPI_OK) { goto fn_fail; } @@ -1865,7 +1847,7 @@ int cuptip_ctx_destroy(cuptip_control_t *pstate) int papi_errno = nvpw_cuda_metricscontext_destroy(state); for (i = 0; i < num_gpus; i++) { reset_cupti_prof_config_images( 
&(state->gpu_ctl[i]) ); - cuptiu_event_table_destroy( &(state->gpu_ctl[i].event_names) ); + cuptiu_event_table_destroy( &(state->gpu_ctl[i].targeted_events) ); for (j = 0; j < state->gpu_ctl[i].rmr_count; j++) { papi_free((void *) state->gpu_ctl[i].rmr[j].pMetricName); } @@ -1955,9 +1937,11 @@ int evt_id_to_info(uint64_t event_id, event_info_t *info) return PAPI_ENOEVNT; } + /* if (cuptiu_dev_check(cuptiu_table_p->events[info->nameid].device_map, info->device) == 0) { return PAPI_ENOEVNT; - } + } + */ if (info->nameid >= cuptiu_table_p->count) { return PAPI_ENOEVNT; @@ -1977,7 +1961,7 @@ int init_event_table(void) /* instatiate struct to collect the total metric count and metric names; instantiated here to avoid scoping issues */ NVPW_MetricsContext_GetMetricNames_Begin_Params getMetricNameBeginParams = { NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE }; - + /* loop through all available devices on the current system */ for (dev_id = 0; dev_id < num_gpus; dev_id++) { found = find_same_chipname(dev_id); @@ -1989,7 +1973,7 @@ int init_event_table(void) /* assigning values to member variables */ getMetricNameBeginParams.pPriv = NULL; - getMetricNameBeginParams.pMetricsContext = avail_events[table_idx].pmetricsContextCreateParams->pMetricsContext; + getMetricNameBeginParams.pMetricsContext = cuptiu_table_p->avail_gpu_info[table_idx].pmetricsContextCreateParams->pMetricsContext; getMetricNameBeginParams.hidePeakSubMetrics = !listsubmetrics; getMetricNameBeginParams.hidePerCycleSubMetrics = !listsubmetrics; getMetricNameBeginParams.hidePctOfPeakSubMetrics = !listsubmetrics; @@ -1997,24 +1981,18 @@ int init_event_table(void) nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_BeginPtr(&getMetricNameBeginParams), goto fn_fail ); /* for each unique device found, store both the total number of metrics and metric names */ - avail_events[table_idx].num_metrics = getMetricNameBeginParams.numMetrics; - avail_events[table_idx].metric_names = 
getMetricNameBeginParams.ppMetricNames; - - papi_errno = cuptiu_event_table_create_init_capacity(avail_events[table_idx].num_metrics * num_gpus, sizeof(cuptiu_event_t), &(avail_events[table_idx].cuptiu_table_p)); - if (papi_errno != PAPI_OK) { - goto fn_exit; - } - avail_events[table_idx].cuptiu_table_p->events = papi_calloc(avail_events[table_idx].num_metrics, sizeof(cuptiu_event_t)); + cuptiu_table_p->avail_gpu_info[table_idx].num_metrics = getMetricNameBeginParams.numMetrics; + cuptiu_table_p->avail_gpu_info[table_idx].metric_names = getMetricNameBeginParams.ppMetricNames; } /* device metadata already collected, set table index */ else { /* set table_idx to */ table_idx = found; } - + /* loop through metrics to add to overall event table */ - for (i = 0; i < avail_events[table_idx].num_metrics; i++) { - papi_errno = get_ntv_events( cuptiu_table_p, avail_events[table_idx].metric_names[i], i, 0, dev_id); + for (i = 0; i < cuptiu_table_p->avail_gpu_info[table_idx].num_metrics; i++) { + papi_errno = get_ntv_events( cuptiu_table_p, cuptiu_table_p->avail_gpu_info[table_idx].metric_names[i], i, 0, dev_id); if (papi_errno != PAPI_OK) goto fn_exit; } @@ -2026,7 +2004,7 @@ int init_event_table(void) NVPW_MetricsContext_GetMetricNames_End_Params getMetricNameEndParams = { .structSize = NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE, .pPriv = NULL, - .pMetricsContext = avail_events[gpu_idx].pmetricsContextCreateParams->pMetricsContext, + .pMetricsContext = cuptiu_table_p->avail_gpu_info[gpu_idx].pmetricsContextCreateParams->pMetricsContext, }; nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_EndPtr((NVPW_MetricsContext_GetMetricNames_End_Params *) &getMetricNameEndParams), goto fn_fail ); } @@ -2084,7 +2062,7 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, return PAPI_ESYS; } } - + cuptiu_dev_set(&event->device_map, gpu_id); return PAPI_OK; @@ -2333,6 +2311,7 @@ int cuptip_evt_enum(uint64_t *event_code, int modifier) case 
PAPI_ENUM_EVENTS: papi_errno = evt_id_to_info(*event_code, &info); if (papi_errno != PAPI_OK) { + printf("We fail enum_events evt_id_to_info.\n"); break; } if (cuptiu_table_p->count > info.nameid + 1) { @@ -2340,6 +2319,9 @@ int cuptip_evt_enum(uint64_t *event_code, int modifier) info.flags = 0; info.nameid++; papi_errno = evt_id_create(&info, event_code); + if (papi_errno != PAPI_OK) { + printf("Failed to create id.\n"); + } break; } papi_errno = PAPI_END; @@ -2524,7 +2506,7 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) } /* collect the description and calculated numpass for a specific nameid */ - if (cuptiu_table_p->events[inf.nameid].desc[0] == 0) { + if (cuptiu_table_p->events[inf.nameid].desc[0] == '\0') { /* find a matching device id to get correct MetricsContext and chip name */ for (i = 0; i < num_gpus; ++i) { if (cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) { @@ -2532,9 +2514,9 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) break; } } - papi_errno = retrieve_metric_descr( avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext, + papi_errno = retrieve_metric_descr( cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams->pMetricsContext, cuptiu_table_p->events[inf.nameid].name, cuptiu_table_p->events[inf.nameid].desc, - avail_events[gpu_id].pmetricsContextCreateParams->pChipName ); + cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams->pChipName ); if (papi_errno != PAPI_OK) { return papi_errno; } @@ -2551,16 +2533,22 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) break; case DEVICE_FLAG: { + int init_metric_dev_id; char devices[PAPI_MAX_STR_LEN] = { 0 }; for (i = 0; i < num_gpus; ++i) { if (cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) { + /* for a metric, store the first device found to use with :device=# */ + if (devices[0] == '\0') { + init_metric_dev_id = i; + } + sprintf(devices + 
strlen(devices), "%i,", i); } } *(devices + strlen(devices) - 1) = 0; /* cuda native event name */ - snprintf( info->symbol, PAPI_HUGE_STR_LEN, "%s:device=%i", cuptiu_table_p->events[inf.nameid].name, inf.device ); + snprintf( info->symbol, PAPI_HUGE_STR_LEN, "%s:device=%i", cuptiu_table_p->events[inf.nameid].name, init_metric_dev_id ); /* cuda native event short description */ snprintf( info->short_descr, PAPI_MIN_STR_LEN, "%s masks:Mandatory device qualifier [%s]", cuptiu_table_p->events[inf.nameid].desc, devices ); diff --git a/src/components/cuda/cupti_utils.h b/src/components/cuda/cupti_utils.h index b79dd5130..9031ce97f 100644 --- a/src/components/cuda/cupti_utils.h +++ b/src/components/cuda/cupti_utils.h @@ -9,10 +9,15 @@ #define __CUPTI_UTILS_H__ #include + +#include + #include + typedef int64_t cuptiu_bitmap_t; typedef int (*cuptiu_dev_get_map_cb)(uint64_t event_id, int *dev_id); +typedef NVPW_CUDA_MetricsContext_Create_Params MCCP_t; typedef struct event_record_s { char name[PAPI_2MAX_STR_LEN]; @@ -20,11 +25,19 @@ typedef struct event_record_s { cuptiu_bitmap_t device_map; } cuptiu_event_t; +typedef struct gpu_record_s { + char chip_name[32]; + MCCP_t *pmetricsContextCreateParams; + int num_metrics; + const char* const* metric_names; +} gpu_record_t; + typedef struct event_table_s { unsigned int count; unsigned int capacity; char added_cuda_evts[30][PAPI_2MAX_STR_LEN]; int added_cuda_dev[30]; + gpu_record_t *avail_gpu_info; cuptiu_event_t *events; void *htable; } cuptiu_event_table_t; From 21f78f263f550bfe8dc7c3915e63dcb25d8265d8 Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Thu, 23 Jan 2025 19:49:56 +0000 Subject: [PATCH 06/16] More changes to cuda workflow to allow for multiple gpus. 
--- src/components/cuda/cupti_profiler.c | 103 +++++++++++---------------- src/components/cuda/cupti_utils.h | 4 +- 2 files changed, 43 insertions(+), 64 deletions(-) diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index d29bb3340..fce17b41f 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -497,13 +497,11 @@ static int get_targeted_events_rmr(cuptip_gpu_state_t *gpu_ctl) /* for each event in the event table collect the raw metric requests */ for (i = 0; i < gpu_ctl->targeted_events->count; i++) { - /* Not using the correct global event names now.*/ papi_errno = retrieve_metric_rmr( gpu_ctl->pmetricsContextCreateParams->pMetricsContext, gpu_ctl->targeted_events->added_cuda_evts[i], &num_dep, &collect_rmr ); - /* why is PAPI_ENOEVNT hard coded? */ if (papi_errno != PAPI_OK) { papi_errno = PAPI_ENOEVNT; goto fn_exit; @@ -813,7 +811,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pChipName = cuptiu_table_p->avail_gpu_info[gpu_ctl->gpu_id].chip_name, }; nvpwCheckErrors( NVPW_CUDA_RawMetricsConfig_CreatePtr(&nvpw_metricsConfigCreateParams), goto fn_fail ); - printf("Made it past first nvpwCheckErrors.\n"); if( gpu_ctl->counterAvailabilityImage.data != NULL) { NVPW_RawMetricsConfig_SetCounterAvailability_Params setCounterAvailabilityParams = { @@ -823,7 +820,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pCounterAvailabilityImage = gpu_ctl->counterAvailabilityImage.data, }; nvpwCheckErrors( NVPW_RawMetricsConfig_SetCounterAvailabilityPtr(&setCounterAvailabilityParams), goto fn_fail ); - printf("Made it pass second nvpwCheckErrors.\n"); }; /* NOTE: maxPassCount is being set to 1 as a final safety net to limit metric collection to a single pass. 
@@ -837,8 +833,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .maxPassCount = 1, }; nvpwCheckErrors( NVPW_RawMetricsConfig_BeginPassGroupPtr(&beginPassGroupParams), goto fn_fail ); - printf("Made it pass third nvpwCheckErrors.\n"); - NVPW_RawMetricsConfig_AddMetrics_Params addMetricsParams = { .structSize = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE, @@ -847,13 +841,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pRawMetricRequests = gpu_ctl->rmr, .numMetricRequests = gpu_ctl->rmr_count, }; - //nvpwCheckErrors( NVPW_RawMetricsConfig_AddMetricsPtr(&addMetricsParams), goto fn_fail ); - NVPA_Status _status = NVPW_RawMetricsConfig_AddMetricsPtr(&addMetricsParams); - if (_status != NVPA_STATUS_SUCCESS) { - printf("Failed with status: %d\n", _status); - } - printf("Made it pass fourth nvpwCheckErrors.\n"); - + nvpwCheckErrors( NVPW_RawMetricsConfig_AddMetricsPtr(&addMetricsParams), goto fn_fail ); NVPW_RawMetricsConfig_EndPassGroup_Params endPassGroupParams = { .structSize = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE, @@ -861,7 +849,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig, }; nvpwCheckErrors( NVPW_RawMetricsConfig_EndPassGroupPtr(&endPassGroupParams), goto fn_fail ); - printf("Made it past fifth nvpwCheckErrors.\n"); NVPW_RawMetricsConfig_GenerateConfigImage_Params generateConfigImageParams = { .structSize = NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE, @@ -869,7 +856,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig, }; nvpwCheckErrors( NVPW_RawMetricsConfig_GenerateConfigImagePtr(&generateConfigImageParams), goto fn_fail ); - printf("Made it past sixth nvpwCheckErrors.\n"); NVPW_RawMetricsConfig_GetConfigImage_Params getConfigImageParams = { .structSize = 
NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE, @@ -879,7 +865,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pBuffer = NULL, }; nvpwCheckErrors( NVPW_RawMetricsConfig_GetConfigImagePtr(&getConfigImageParams), goto fn_fail ); - printf("Made it past seventh nvpwCheckErrors.\n"); gpu_ctl->configImage.size = getConfigImageParams.bytesCopied; gpu_ctl->configImage.data = (uint8_t *) papi_calloc(gpu_ctl->configImage.size, sizeof(uint8_t)); @@ -891,7 +876,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) getConfigImageParams.bytesAllocated = gpu_ctl->configImage.size; getConfigImageParams.pBuffer = gpu_ctl->configImage.data; nvpwCheckErrors( NVPW_RawMetricsConfig_GetConfigImagePtr(&getConfigImageParams), goto fn_fail ); - printf("Made it past eigth nvpwCheckErrors.\n"); NVPW_RawMetricsConfig_Destroy_Params rawMetricsConfigDestroyParams = { .structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE, @@ -899,7 +883,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig, }; nvpwCheckErrors( NVPW_RawMetricsConfig_DestroyPtr((NVPW_RawMetricsConfig_Destroy_Params *) &rawMetricsConfigDestroyParams), goto fn_fail ); - printf("Made it past ninth nvpwCheckErrors.\n"); return PAPI_OK; fn_fail: @@ -1123,7 +1106,7 @@ static int begin_profiling(cuptip_gpu_state_t *gpu_ctl) }; cuptiCheckErrors( cuptiProfilerEnableProfilingPtr(&enableProfilingParams), goto fn_fail ); - char rangeName[64]; + char rangeName[PAPI_MIN_STR_LEN]; sprintf(rangeName, "PAPI_Range_%d", gpu_ctl->gpu_id); CUpti_Profiler_PushRange_Params pushRangeParams = { .structSize = CUpti_Profiler_PushRange_Params_STRUCT_SIZE, @@ -1451,19 +1434,30 @@ int cuptip_init(void) return PAPI_EMISC; } +/** @class verify_events + * @brief Verify user added events and store metadata i.e. metric names + * and device id's . + * @param *events_id + * Cuda native event id's. 
+ * @param num_events + * Number of Cuda native events a user is wanting to count. + * @param state + * Struct that holds read count, running, cuptip_info_t, and + * cuptip_gpu_state_t. +*/ int verify_events(uint64_t *events_id, int num_events, cuptip_control_t state) { - int papi_errno = PAPI_OK, i; - char name[PAPI_MAX_STR_LEN] = { 0 }; + int papi_errno, i; + char *metricName; for (i = 0; i < num_gpus; i++) { papi_errno = cuptiu_event_table_create_init_capacity( - num_events * num_gpus, + num_events, sizeof(cuptiu_event_t), &(state->gpu_ctl[i].targeted_events) ); if (papi_errno != PAPI_OK) { - goto fn_exit; + return papi_errno; } } @@ -1472,19 +1466,21 @@ int verify_events(uint64_t *events_id, int num_events, papi_errno = evt_id_to_info(events_id[i], &info); if (papi_errno != PAPI_OK) { break; - } - sprintf(name, "%s", cuptiu_table_p->events[info.nameid].name); - strcpy(state->gpu_ctl[info.device].targeted_events->added_cuda_evts[i], name); - state->gpu_ctl[info.device].targeted_events->added_cuda_dev[i] = info.device; + } + + /* store metadata i.e. 
metric names and device id's */ + metricName = state->gpu_ctl[info.device].targeted_events->added_cuda_evts[i]; + snprintf(metricName, PAPI_MAX_STR_LEN, "%s", cuptiu_table_p->events[info.nameid].name); + + state->gpu_ctl[info.device].targeted_events->added_cuda_dev[i] = info.device; + void *p; - if (htable_find(cuptiu_table_p->htable, name, (void **) &p) != HTABLE_SUCCESS) { - htable_insert(state->gpu_ctl[info.device].targeted_events->htable, name, (void **) &p ); + if (htable_find(cuptiu_table_p->htable, metricName, (void **) &p) != HTABLE_SUCCESS) { + return PAPI_ENOEVNT; } state->gpu_ctl[info.device].targeted_events->count++; - } - - fn_exit: - return papi_errno; + } + return PAPI_OK; } /** @class cuptip_ctx_create @@ -1607,20 +1603,15 @@ int cuptip_ctx_start(cuptip_control_t state) /* CUPTI profiler host configuration */ papi_errno = metric_get_config_image(gpu_ctl); - printf("papi_errno first: %d\n", papi_errno); papi_errno += metric_get_counter_data_prefix_image(gpu_ctl); - printf("papi_errno second: %d\n", papi_errno); papi_errno += create_counter_data_image(gpu_ctl); - printf("papi_errno third: %d\n", papi_errno); if (papi_errno != PAPI_OK) { - printf("cupti profiler host configuration.\n"); ERRDBG("Failed to create CUPTI profiler state for gpu %d\n", gpu_id); goto fn_fail; } papi_errno = begin_profiling(gpu_ctl); if (papi_errno != PAPI_OK) { - printf("begin profiling failed.\n"); ERRDBG("Failed to start profiling for gpu %d\n", gpu_id); goto fn_fail; } @@ -1700,7 +1691,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) if (papi_errno != PAPI_OK) { goto fn_exit; } - for (i = 0; i < (int) gpu_ctl->targeted_events->count; i++) { + for (i = 0; i < gpu_ctl->targeted_events->count; i++) { if (state->read_count == 0) { counter_vals[i] = counts[i]; } @@ -1909,9 +1900,11 @@ int cuptip_shutdown(void) */ int evt_id_create(event_info_t *info, uint64_t *event_id) { + *event_id = (uint64_t)(info->device << DEVICE_SHIFT); *event_id |= 
(uint64_t)(info->flags << QLMASK_SHIFT); *event_id |= (uint64_t)(info->nameid << NAMEID_SHIFT); + return PAPI_OK; } @@ -1937,12 +1930,6 @@ int evt_id_to_info(uint64_t event_id, event_info_t *info) return PAPI_ENOEVNT; } - /* - if (cuptiu_dev_check(cuptiu_table_p->events[info->nameid].device_map, info->device) == 0) { - return PAPI_ENOEVNT; - } - */ - if (info->nameid >= cuptiu_table_p->count) { return PAPI_ENOEVNT; } @@ -2043,8 +2030,7 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, /* check to see if capacity has been correctly allocated */ if (evt_table->count >= evt_table->capacity) { - printf("Table count is larger than allocated capacity.\n"); - return PAPI_ENOMEM; + return PAPI_EBUG; } cuptiu_event_t *event; @@ -2311,7 +2297,6 @@ int cuptip_evt_enum(uint64_t *event_code, int modifier) case PAPI_ENUM_EVENTS: papi_errno = evt_id_to_info(*event_code, &info); if (papi_errno != PAPI_OK) { - printf("We fail enum_events evt_id_to_info.\n"); break; } if (cuptiu_table_p->count > info.nameid + 1) { @@ -2319,9 +2304,6 @@ int cuptip_evt_enum(uint64_t *event_code, int modifier) info.flags = 0; info.nameid++; papi_errno = evt_id_create(&info, event_code); - if (papi_errno != PAPI_OK) { - printf("Failed to create id.\n"); - } break; } papi_errno = PAPI_END; @@ -2495,17 +2477,17 @@ static int evt_code_to_name(uint64_t event_code, char *name, int len) int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) { - int papi_errno, len, i, gpu_id; - event_info_t inf; + int papi_errno, i, gpu_id; char description[PAPI_HUGE_STR_LEN]; - /* get event code info */ + /* get the events nameid and flags */ + event_info_t inf; papi_errno = evt_id_to_info(event_code, &inf); if (papi_errno != PAPI_OK) { return papi_errno; } - /* collect the description and calculated numpass for a specific nameid */ + /* collect the description and calculated numpass for the Cuda event */ if (cuptiu_table_p->events[inf.nameid].desc[0] == '\0') { /* find a 
matching device id to get correct MetricsContext and chip name */ for (i = 0; i < num_gpus; ++i) { @@ -2524,11 +2506,9 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) switch (inf.flags) { case (0): - /* cuda native event name */ + /* store details for the Cuda event */ snprintf( info->symbol, PAPI_HUGE_STR_LEN, "%s", cuptiu_table_p->events[inf.nameid].name ); - /* cuda native event short description */ snprintf( info->short_descr, PAPI_MIN_STR_LEN, "%s", cuptiu_table_p->events[inf.nameid].desc ); - /* cuda native event long description */ snprintf( info->long_descr, PAPI_HUGE_STR_LEN, "%s", cuptiu_table_p->events[inf.nameid].desc ); break; case DEVICE_FLAG: @@ -2537,7 +2517,8 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) char devices[PAPI_MAX_STR_LEN] = { 0 }; for (i = 0; i < num_gpus; ++i) { if (cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) { - /* for a metric, store the first device found to use with :device=# */ + /* for an event, store the first device found to use with :device=#, + as on a heterogenous system events may not appear on each device */ if (devices[0] == '\0') { init_metric_dev_id = i; } @@ -2547,12 +2528,10 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) } *(devices + strlen(devices) - 1) = 0; - /* cuda native event name */ + /* store details for the Cuda event */ snprintf( info->symbol, PAPI_HUGE_STR_LEN, "%s:device=%i", cuptiu_table_p->events[inf.nameid].name, init_metric_dev_id ); - /* cuda native event short description */ snprintf( info->short_descr, PAPI_MIN_STR_LEN, "%s masks:Mandatory device qualifier [%s]", cuptiu_table_p->events[inf.nameid].desc, devices ); - /* cuda native event long description */ snprintf( info->long_descr, PAPI_HUGE_STR_LEN, "%s masks:Mandatory device qualifier [%s]", cuptiu_table_p->events[inf.nameid].desc, devices ); break; diff --git a/src/components/cuda/cupti_utils.h b/src/components/cuda/cupti_utils.h 
index 9031ce97f..1c8c8d1a9 100644 --- a/src/components/cuda/cupti_utils.h +++ b/src/components/cuda/cupti_utils.h @@ -26,14 +26,14 @@ typedef struct event_record_s { } cuptiu_event_t; typedef struct gpu_record_s { - char chip_name[32]; + char chip_name[PAPI_MIN_STR_LEN]; MCCP_t *pmetricsContextCreateParams; int num_metrics; const char* const* metric_names; } gpu_record_t; typedef struct event_table_s { - unsigned int count; + int count; unsigned int capacity; char added_cuda_evts[30][PAPI_2MAX_STR_LEN]; int added_cuda_dev[30]; From bb7e9aad51261491eaeceabd293023bfdb68efad Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Sun, 2 Feb 2025 00:45:17 +0000 Subject: [PATCH 07/16] Pushing up to test on Voltar at oregon --- src/components/cuda/cupti_profiler.c | 133 +++++++++++++----------- src/components/cuda/cupti_utils.h | 5 +- src/components/cuda/linux-cuda.c | 25 +++-- src/components/cuda/papi_cupti_common.c | 12 ++- 4 files changed, 103 insertions(+), 72 deletions(-) diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index fce17b41f..873c80f65 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -20,6 +20,7 @@ #include "cupti_config.h" #include "lcuda_debug.h" #include "htable.h" +#include /** * Event identifier encoding format: @@ -63,7 +64,7 @@ struct byte_array_s { struct cuptip_gpu_state_s { int gpu_id; - cuptiu_event_table_t *targeted_events; + cuptiu_event_table_t *added_events; int rmr_count; NVPA_RawMetricRequest *rmr; MCCP_t *pmetricsContextCreateParams; @@ -91,6 +92,7 @@ static gpu_record_t *avail_gpu_info; static cuptiu_event_table_t *cuptiu_table_p; + /* load and unload cuda function pointers */ static int load_cupti_perf_sym(void); static int unload_cupti_perf_sym(void); @@ -149,7 +151,7 @@ static int retrieve_metric_rmr( NVPA_MetricsContext *pMetricsContext, const char /* misc */ static int get_event_collection_method(const char *evt_name); -static int 
get_targeted_events_rmr(cuptip_gpu_state_t *gpu_ctl); +static int get_added_events_rmr(cuptip_gpu_state_t *gpu_ctl); static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl); static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts); static int num_unique_devs(int num_gpus); @@ -477,7 +479,7 @@ static int get_chip_name(int dev_num, char* chipName) return PAPI_OK; } -/** @class get_targeted_events_rmr +/** @class get_added_events_rmr * @brief For a Cuda native event name collect raw metrics and count * of raw metrics for collection. Raw Metrics are one layer of the Metric API * and contains the list of raw counters and generates configuration file @@ -487,7 +489,7 @@ static int get_chip_name(int dev_num, char* chipName) * Structure of type cuptip_gpu_state_t which has member variables such as * gpu_id, rmr, rmr_count, and more. */ -static int get_targeted_events_rmr(cuptip_gpu_state_t *gpu_ctl) +static int get_added_events_rmr(cuptip_gpu_state_t *gpu_ctl) { COMPDBG("Entering.\n"); int gpu_id, num_dep, count_raw_metrics = 0, papi_errno = PAPI_OK; @@ -496,10 +498,10 @@ static int get_targeted_events_rmr(cuptip_gpu_state_t *gpu_ctl) cuptiu_event_t *evt_rec; /* for each event in the event table collect the raw metric requests */ - for (i = 0; i < gpu_ctl->targeted_events->count; i++) { + for (i = 0; i < gpu_ctl->added_events->count; i++) { papi_errno = retrieve_metric_rmr( gpu_ctl->pmetricsContextCreateParams->pMetricsContext, - gpu_ctl->targeted_events->added_cuda_evts[i], &num_dep, + gpu_ctl->added_events->cuda_evts[i], &num_dep, &collect_rmr ); if (papi_errno != PAPI_OK) { @@ -704,11 +706,11 @@ static int check_multipass(cuptip_control_t state) for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); - if (gpu_ctl->targeted_events->count == 0) { + if (gpu_ctl->added_events->count == 0) { continue; } - papi_errno = get_targeted_events_rmr(gpu_ctl); + papi_errno = get_added_events_rmr(gpu_ctl); if (papi_errno != 
PAPI_OK) { goto fn_exit; } @@ -795,7 +797,7 @@ static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl) /** @class metric_get_config_image * @brief Retrieves binary ConfigImage for the Cuda native event metrics listed - * for collection. The function get_targeted_events_rmr( ... ) must be + * for collection. The function get_added_events_rmr( ... ) must be * called before this step is possible. * @param *gpu_ctl * Structure of type cuptip_gpu_state_t which has member variables such as @@ -804,11 +806,13 @@ static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl) static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) { COMPDBG("Entering.\n"); + int gpu_id = gpu_ctl->gpu_id; + NVPW_CUDA_RawMetricsConfig_Create_Params nvpw_metricsConfigCreateParams = { .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE, .pPriv = NULL, .activityKind = NVPA_ACTIVITY_KIND_PROFILER, - .pChipName = cuptiu_table_p->avail_gpu_info[gpu_ctl->gpu_id].chip_name, + .pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name, }; nvpwCheckErrors( NVPW_CUDA_RawMetricsConfig_CreatePtr(&nvpw_metricsConfigCreateParams), goto fn_fail ); @@ -820,7 +824,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pCounterAvailabilityImage = gpu_ctl->counterAvailabilityImage.data, }; nvpwCheckErrors( NVPW_RawMetricsConfig_SetCounterAvailabilityPtr(&setCounterAvailabilityParams), goto fn_fail ); - }; + } /* NOTE: maxPassCount is being set to 1 as a final safety net to limit metric collection to a single pass. Metrics that require multiple passes would fail further down at AddMetrics due to this. @@ -891,7 +895,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) /** @class metric_get_counter_data_prefix_image * @brief Retrieves binary CounterDataPrefix for the Cuda native event metrics - * listed for collection. The function get_targeted_events_rmr( ... ) + * listed for collection. The function get_added_events_rmr( ... 
) * must be called before this step is possible. * @param *gpu_ctl * Structure of type cuptip_gpu_state_t which has member variables such as @@ -1195,7 +1199,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts) { COMPDBG("eval_metric_values. dev = %d\n", gpu_ctl->gpu_id); int i, papi_errno = PAPI_OK; - int numMetrics = gpu_ctl->targeted_events->count; + int numMetrics = gpu_ctl->added_events->count; double *gpuValues; char **metricNames; @@ -1219,7 +1223,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts) } for (i = 0; i < numMetrics; i++) { - metricNames[i] = gpu_ctl->targeted_events->added_cuda_evts[i]; + metricNames[i] = gpu_ctl->added_events->cuda_evts[i]; LOGDBG("Setting metric name %s\n", metricNames[i]); } @@ -1246,7 +1250,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts) nvpwCheckErrors( NVPW_MetricsContext_EvaluateToGpuValuesPtr(&evalToGpuParams), goto fn_fail ); /* store the gpu values */ - for (i = 0; i < (int) gpu_ctl->targeted_events->count; i++) { + for (i = 0; i < (int) gpu_ctl->added_events->count; i++) { counts[i] = gpuValues[i]; } @@ -1398,7 +1402,6 @@ int cuptip_init(void) goto fn_fail; } - /* if no gpu's are found exit */ if (num_gpus <= 0) { cuptic_disabled_reason_set("No GPUs found on system."); goto fn_fail; @@ -1412,13 +1415,15 @@ int cuptip_init(void) goto fn_fail; } - /* init htable and allocate memory */ + /* initialize the main event table for metric collection */ init_main_htable(); papi_errno = init_all_metrics(); if (papi_errno != PAPI_OK) { goto fn_fail; } + /* initialize hash table with cuda native events */ + init_event_table(); papi_errno = cuInitPtr(0); if (papi_errno != CUDA_SUCCESS) { @@ -1426,9 +1431,6 @@ int cuptip_init(void) goto fn_fail; } - /* initialize hash table with cuda native events */ - init_event_table(); - return PAPI_OK; fn_fail: return PAPI_EMISC; @@ -1450,11 +1452,12 @@ int verify_events(uint64_t *events_id, int 
num_events, { int papi_errno, i; char *metricName; + int idx; - for (i = 0; i < num_gpus; i++) { + for (i = 0; i < num_gpus; i++) { papi_errno = cuptiu_event_table_create_init_capacity( num_events, - sizeof(cuptiu_event_t), &(state->gpu_ctl[i].targeted_events) + sizeof(cuptiu_event_t), &(state->gpu_ctl[i].added_events) ); if (papi_errno != PAPI_OK) { return papi_errno; @@ -1465,21 +1468,24 @@ int verify_events(uint64_t *events_id, int num_events, event_info_t info; papi_errno = evt_id_to_info(events_id[i], &info); if (papi_errno != PAPI_OK) { - break; + return papi_errno; } - - /* store metadata i.e. metric names and device id's */ - metricName = state->gpu_ctl[info.device].targeted_events->added_cuda_evts[i]; - snprintf(metricName, PAPI_MAX_STR_LEN, "%s", cuptiu_table_p->events[info.nameid].name); + + /* for the current device table get the next event index */ + idx = state->gpu_ctl[info.device].added_events->count; - state->gpu_ctl[info.device].targeted_events->added_cuda_dev[i] = info.device; + metricName = state->gpu_ctl[info.device].added_events->cuda_evts[idx]; + snprintf(metricName, PAPI_MAX_STR_LEN, "%s", cuptiu_table_p->events[info.nameid].name); void *p; if (htable_find(cuptiu_table_p->htable, metricName, (void **) &p) != HTABLE_SUCCESS) { return PAPI_ENOEVNT; - } - state->gpu_ctl[info.device].targeted_events->count++; + } + state->gpu_ctl[info.device].added_events->cuda_devs[idx] = info.device; + state->gpu_ctl[info.device].added_events->evt_pos[idx] = i; + state->gpu_ctl[info.device].added_events->count++; /* total number of events added for a specific device */ } + return PAPI_OK; } @@ -1500,27 +1506,25 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t int papi_errno = PAPI_OK, gpu_id, i; long long *counters = NULL; char name[PAPI_2MAX_STR_LEN] = { 0 }; - cuptiu_event_table_t *targeted_targeted_events; - /* create a cuptip_control_t struct which contains read_count, running, cupti_info_t and cuptip_gpu_state_t */ 
cuptip_control_t state = (cuptip_control_t) papi_calloc (1, sizeof(struct cuptip_control_s)); if (state == NULL) { return PAPI_ENOMEM; } - /* allocate memory for the total number of gpus for the cuptip_gpu_state_t struct - with the device qualifier refactor we only want to count the total number of unique gpus */ state->gpu_ctl = (cuptip_gpu_state_t *) papi_calloc(num_gpus, sizeof(cuptip_gpu_state_t)); if (state->gpu_ctl == NULL) { return PAPI_ENOMEM; } counters = papi_malloc(num_events * sizeof(*counters)); + if (counters == NULL) { + return PAPI_ENOMEM; + } /* for each unique gpu store the gpu id for that gpu index */ for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { state->gpu_ctl[gpu_id].gpu_id = gpu_id; - //state->gpu_ctl[gpu_id].targeted_events = targeted_targeted_events; } /* register the user created cuda context for the current gpu if not already known */ @@ -1529,18 +1533,19 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t goto fn_exit; } - /* creates a pMetricsContext */ + /* creates a MetricsContext */ papi_errno = nvpw_cuda_metricscontext_create(state); if (papi_errno != PAPI_OK) { goto fn_exit; } + /* verify user added events are available on the system */ papi_errno = verify_events(events_id, num_events, state); if (papi_errno != PAPI_OK) { goto fn_exit; } - /* multipass is not supported; therefore, we must check the Cuda native event */ + /* check to make sure added events do not require multiple passes */ papi_errno = check_multipass(state); if (papi_errno != PAPI_OK) { goto fn_exit; @@ -1580,16 +1585,16 @@ int cuptip_ctx_start(cuptip_control_t state) /* enumerate through all of the unique gpus */ for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); - if (gpu_ctl->targeted_events->count == 0) { + if (gpu_ctl->added_events->count == 0) { continue; } - LOGDBG("Device num %d: event_count %d, rmr count %d\n", gpu_id, gpu_ctl->targeted_events->count, gpu_ctl->rmr_count); - papi_errno = 
cuptic_device_acquire(state->gpu_ctl[gpu_id].targeted_events); + LOGDBG("Device num %d: event_count %d, rmr count %d\n", gpu_id, gpu_ctl->added_events->count, gpu_ctl->rmr_count); + papi_errno = cuptic_device_acquire(state->gpu_ctl[gpu_id].added_events); if (papi_errno != PAPI_OK) { ERRDBG("Profiling same gpu from multiple event sets not allowed.\n"); return papi_errno; } - /* get the cuda context for the unique gpu */ + /* get the cuda context */ papi_errno = cuptic_ctxarr_get_ctx(state->info, gpu_id, &ctx); /* bind the specified CUDA context to the calling CPU thread */ cudaCheckErrors( cuCtxSetCurrentPtr(ctx), goto fn_fail_misc ); @@ -1640,8 +1645,9 @@ int cuptip_ctx_start(cuptip_control_t state) int cuptip_ctx_read(cuptip_control_t state, long long **counters) { COMPDBG("Entering.\n"); - int papi_errno, gpu_id, i, j = 0, method; - long long counts[30], *counter_vals = state->counters; + int papi_errno, gpu_id, i, j = 0, method, evt_pos; + long long counts[30]; + long long *counter_vals = state->counters; cuptip_gpu_state_t *gpu_ctl = NULL; CUcontext userCtx = NULL, ctx = NULL; @@ -1650,10 +1656,9 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) cudaArtCheckErrors( cudaFreePtr(NULL), goto fn_fail_misc ); cudaArtCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); } - for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); - if (gpu_ctl->targeted_events->count == 0) { + if (gpu_ctl->added_events->count == 0) { continue; } @@ -1691,22 +1696,27 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) if (papi_errno != PAPI_OK) { goto fn_exit; } - for (i = 0; i < gpu_ctl->targeted_events->count; i++) { + + for (i = 0; i < gpu_ctl->added_events->count; i++) { + printf("counts value: %d and gpu_id: %d\n", counts[i], gpu_id); + printf("evt_pos: %d\n", gpu_ctl->added_events->evt_pos[i]); + evt_pos = gpu_ctl->added_events->evt_pos[i]; if (state->read_count == 0) { - counter_vals[i] = counts[i]; + 
counter_vals[evt_pos] = counts[i]; } else { + printf("WE ENTER ELSE STATEMENT.\n"); /* determine collection method such as max, min, sum, and avg for an added Cuda native event */ - method = get_event_collection_method(gpu_ctl->targeted_events->added_cuda_evts[i]); + method = get_event_collection_method(gpu_ctl->added_events->cuda_evts[i]); switch (method) { case CUDA_SUM: - counter_vals[i] += counts[i]; + counter_vals[evt_pos] += counts[i]; break; case CUDA_MIN: - counter_vals[i] = counter_vals[i] < counts[i] ? counter_vals[i] : counts[i]; + counter_vals[evt_pos] = counter_vals[evt_pos] < counts[i] ? counter_vals[evt_pos] : counts[i]; break; case CUDA_MAX: - counter_vals[i] = counter_vals[i] > counts[i] ? counter_vals[i] : counts[i]; + counter_vals[evt_pos] = counter_vals[evt_pos] > counts[i] ? counter_vals[evt_pos] : counts[i]; break; case CUDA_AVG: /* (size * average + value) / (size + 1) @@ -1714,15 +1724,15 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) average - current average value - number to add to the average */ - counter_vals[i] = (state->read_count * counter_vals[j++] + counts[i]) / (state->read_count + 1); + counter_vals[evt_pos] = (state->read_count * counter_vals[j++] + counts[i]) / (state->read_count + 1); break; default: - counter_vals[i] = counts[i]; + counter_vals[evt_pos] = counts[i]; break; } } } - *counters = state->counters; + *counters = counter_vals; cuptiCheckErrors( cuptiProfilerCounterDataImageInitializePtr(&gpu_ctl->initializeParams), goto fn_fail_misc ); cuptiCheckErrors( cuptiProfilerCounterDataImageInitializeScratchBufferPtr(&gpu_ctl->initScratchBufferParams), goto fn_fail_misc ); @@ -1734,7 +1744,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) }; cuptiCheckErrors( cuptiProfilerBeginPassPtr(&beginPassParams), goto fn_fail_misc ); - char rangeName[64]; + char rangeName[PAPI_MIN_STR_LEN]; sprintf(rangeName, "PAPI_Range_%d", gpu_ctl->gpu_id); CUpti_Profiler_PushRange_Params pushRangeParams = 
{ .structSize = CUpti_Profiler_PushRange_Params_STRUCT_SIZE, @@ -1799,7 +1809,7 @@ int cuptip_ctx_stop(cuptip_control_t state) for (gpu_id=0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); - if (gpu_ctl->targeted_events->count == 0) { + if (gpu_ctl->added_events->count == 0) { continue; } papi_errno = cuptic_ctxarr_get_ctx(state->info, gpu_id, &ctx); @@ -1808,7 +1818,7 @@ int cuptip_ctx_stop(cuptip_control_t state) if (papi_errno != PAPI_OK) { goto fn_fail; } - papi_errno = cuptic_device_release(state->gpu_ctl[gpu_id].targeted_events); + papi_errno = cuptic_device_release(state->gpu_ctl[gpu_id].added_events); if (papi_errno != PAPI_OK) { goto fn_fail; } @@ -1838,7 +1848,7 @@ int cuptip_ctx_destroy(cuptip_control_t *pstate) int papi_errno = nvpw_cuda_metricscontext_destroy(state); for (i = 0; i < num_gpus; i++) { reset_cupti_prof_config_images( &(state->gpu_ctl[i]) ); - cuptiu_event_table_destroy( &(state->gpu_ctl[i].targeted_events) ); + cuptiu_event_table_destroy( &(state->gpu_ctl[i].added_events) ); for (j = 0; j < state->gpu_ctl[i].rmr_count; j++) { papi_free((void *) state->gpu_ctl[i].rmr[j].pMetricName); } @@ -2029,7 +2039,7 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, } /* check to see if capacity has been correctly allocated */ - if (evt_table->count >= evt_table->capacity) { + if (*count >= evt_table->capacity) { return PAPI_EBUG; } @@ -2048,7 +2058,7 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, return PAPI_ESYS; } } - + cuptiu_dev_set(&event->device_map, gpu_id); return PAPI_OK; @@ -2366,7 +2376,6 @@ int cuptip_evt_code_to_descr(uint64_t event_code, char *descr, int len) */ int cuptip_evt_name_to_code(const char *name, uint64_t *event_code) { - int htable_errno, device, flags, nameid, papi_errno = PAPI_OK; cuptiu_event_t *event; char base[PAPI_MAX_STR_LEN] = { 0 }; @@ -2421,7 +2430,7 @@ int cuptip_evt_name_to_code(const char *name, uint64_t *event_code) * 
Maximum alloted characters for base Cuda native event name. */ int cuptip_evt_code_to_name(uint64_t event_code, char *name, int len) -{ +{ return evt_code_to_name(event_code, name, len); } diff --git a/src/components/cuda/cupti_utils.h b/src/components/cuda/cupti_utils.h index 1c8c8d1a9..17c186111 100644 --- a/src/components/cuda/cupti_utils.h +++ b/src/components/cuda/cupti_utils.h @@ -35,8 +35,9 @@ typedef struct gpu_record_s { typedef struct event_table_s { int count; unsigned int capacity; - char added_cuda_evts[30][PAPI_2MAX_STR_LEN]; - int added_cuda_dev[30]; + char cuda_evts[30][PAPI_2MAX_STR_LEN]; + int cuda_devs[30]; + int evt_pos[30]; gpu_record_t *avail_gpu_info; cuptiu_event_t *events; void *htable; diff --git a/src/components/cuda/linux-cuda.c b/src/components/cuda/linux-cuda.c index 37e173864..2a6f8485c 100644 --- a/src/components/cuda/linux-cuda.c +++ b/src/components/cuda/linux-cuda.c @@ -171,6 +171,8 @@ static int cuda_init_private(void) _papi_hwi_lock(COMPONENT_LOCK); SUBDBG("ENTER\n"); + if (_cuda_vector.cmp_info.initialized) goto fn_exit; + SUBDBG("Proceeding\n"); papi_errno = cuptid_init(); if (papi_errno != PAPI_OK) { @@ -201,10 +203,12 @@ static int cuda_init_private(void) static int check_n_initialize(void) { + + //_papi_hwi_lock(COMPONENT_LOCK); if (!_cuda_vector.cmp_info.initialized) { return cuda_init_private(); } - + //_papi_hwi_unlock(COMPONENT_LOCK); return _cuda_vector.cmp_info.disabled; } @@ -215,9 +219,11 @@ static int cuda_ntv_enum_events(unsigned int *event_code, int modifier) if (papi_errno != PAPI_OK) { goto fn_exit; } - + uint64_t code = *(uint64_t *) event_code; + //_papi_hwi_lock(COMPONENT_LOCK); papi_errno = cuptid_evt_enum(&code, modifier); + //_papi_hwi_unlock(COMPONENT_LOCK); *event_code = (unsigned int) code; fn_exit: @@ -233,9 +239,11 @@ static int cuda_ntv_name_to_code(const char *name, unsigned int *event_code) if (papi_errno != PAPI_OK) { goto fn_exit; } - + uint64_t code; + //_papi_hwi_lock(COMPONENT_LOCK); 
papi_errno = cuptid_evt_name_to_code(name, &code); + //_papi_hwi_unlock(COMPONENT_LOCK); *event_code = (unsigned int) code; fn_exit: @@ -269,7 +277,9 @@ static int cuda_ntv_code_to_descr(unsigned int event_code, char *descr, int len) goto fn_fail; } + //_papi_hwi_lock(COMPONENT_LOCK); papi_errno = cuptid_evt_code_to_descr((uint64_t) event_code, descr, len); + //_papi_hwi_unlock(COMPONENT_LOCK); fn_exit: SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno)); @@ -286,7 +296,7 @@ static int cuda_ntv_code_to_info(unsigned int event_code, PAPI_event_info_t *inf goto fn_fail; } - papi_errno = cuptid_evt_code_to_info((uint64_t) event_code, info); + papi_errno = cuptid_evt_code_to_info((uint64_t) event_code, info); fn_exit: SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno)); @@ -317,7 +327,8 @@ static int cuda_shutdown_thread(hwd_context_t *ctx) static int cuda_init_control_state(hwd_control_state_t __attribute__((unused)) *ctl) { COMPDBG("Entering.\n"); - return PAPI_OK; + //return PAPI_OK; + return check_n_initialize(); } static int cuda_set_domain(hwd_control_state_t __attribute__((unused)) *ctrl, int domain) @@ -360,7 +371,7 @@ static int cuda_update_control_state(hwd_control_state_t *ctl, NativeInfo_t *ntv if (papi_errno != PAPI_OK) { goto fn_exit; } - + printf("ntv_count: %d\n", ntv_count); /* needed to make sure multipass events are caught with proper error code (PAPI_EMULPASS)*/ papi_errno = cuptid_ctx_create(cuda_ctl->info, &(cuda_ctl->cuptid_ctx), cuda_ctl->events_id, cuda_ctl->num_events); fn_exit: @@ -403,7 +414,7 @@ int update_native_events(cuda_control_t *ctl, NativeInfo_t *ntv_info, sorted_events[i].frontend_idx = i; } - qsort(sorted_events, ntv_count, sizeof(struct event_map_item), compare); + //qsort(sorted_events, ntv_count, sizeof(struct event_map_item), compare); for (i = 0; i < ntv_count; ++i) { ctl->events_id[i] = sorted_events[i].event_id; diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c index f3aeb9e08..ca4b953d6 
100644 --- a/src/components/cuda/papi_cupti_common.c +++ b/src/components/cuda/papi_cupti_common.c @@ -619,6 +619,8 @@ int cuptic_ctxarr_update_current(cuptic_info_t info) return PAPI_EMISC; } + printf("gpu_id inside update_current is: %d\n", gpu_id); + /* return cuda context bound to the calling CPU thread */ cuda_err = cuCtxGetCurrentPtr(&pctx); if (cuda_err != cudaSuccess) { @@ -636,6 +638,7 @@ int cuptic_ctxarr_update_current(cuptic_info_t info) } /* cuda context not found for calling CPU thread */ else { + printf("We create a cuda context.\n"); cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC); cudaCheckErrors(cuCtxGetCurrentPtr(&info[gpu_id].ctx), return PAPI_EMISC); LOGDBG("Using primary device context %p for device %d.\n", info[gpu_id].ctx, gpu_id); @@ -654,6 +657,13 @@ int cuptic_ctxarr_update_current(cuptic_info_t info) int cuptic_ctxarr_get_ctx(cuptic_info_t info, int gpu_idx, CUcontext *ctx) { *ctx = info[gpu_idx].ctx; + printf("gpu_idx: %d\n", gpu_idx); + if (*ctx == NULL) { + printf("ctx is null.\n"); + *ctx = info[0].ctx; + if (*ctx != NULL) printf("Gpu id 0 is not null.\n"); + } + return PAPI_OK; } @@ -699,7 +709,7 @@ static int _devmask_events_get(cuptiu_event_table_t *evt_table, gpu_occupancy_t gpu_occupancy_t acq_mask = 0; cuptiu_event_t *evt_rec; for (i = 0; i < evt_table->count; i++) { - acq_mask |= (1 << evt_table->added_cuda_dev[i]); + acq_mask |= (1 << evt_table->cuda_devs[i]); } *bitmask = acq_mask; fn_exit: From d54558d17c615f8b5c9260e4c1a6162b3bbb559c Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Wed, 5 Feb 2025 13:15:21 +0000 Subject: [PATCH 08/16] Support for heterogeneous gpu configurations final. 
--- src/components/cuda/cupti_profiler.c | 37 ++++++++++++------------- src/components/cuda/linux-cuda.c | 18 ++---------- src/components/cuda/papi_cupti_common.c | 10 ------- 3 files changed, 20 insertions(+), 45 deletions(-) diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index 873c80f65..b754b5c4a 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -20,7 +20,6 @@ #include "cupti_config.h" #include "lcuda_debug.h" #include "htable.h" -#include /** * Event identifier encoding format: @@ -90,9 +89,9 @@ static void *dl_nvpw; static int num_gpus; static gpu_record_t *avail_gpu_info; +/* main event table to store metrics */ static cuptiu_event_table_t *cuptiu_table_p; - /* load and unload cuda function pointers */ static int load_cupti_perf_sym(void); static int unload_cupti_perf_sym(void); @@ -154,7 +153,6 @@ static int get_event_collection_method(const char *evt_name); static int get_added_events_rmr(cuptip_gpu_state_t *gpu_ctl); static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl); static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts); -static int num_unique_devs(int num_gpus); /* nvperf function pointers */ NVPA_Status ( *NVPW_GetSupportedChipNamesPtr ) (NVPW_GetSupportedChipNames_Params* params); @@ -904,10 +902,12 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) static int metric_get_counter_data_prefix_image(cuptip_gpu_state_t *gpu_ctl) { COMPDBG("Entering.\n"); + int gpu_id = gpu_ctl->gpu_id; + NVPW_CounterDataBuilder_Create_Params counterDataBuilderCreateParams = { .structSize = NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE, .pPriv = NULL, - .pChipName = cuptiu_table_p->avail_gpu_info[gpu_ctl->gpu_id].chip_name, + .pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name, }; nvpwCheckErrors( NVPW_CounterDataBuilder_CreatePtr(&counterDataBuilderCreateParams), goto fn_fail ); @@ -1111,7 +1111,8 @@ static int 
begin_profiling(cuptip_gpu_state_t *gpu_ctl) cuptiCheckErrors( cuptiProfilerEnableProfilingPtr(&enableProfilingParams), goto fn_fail ); char rangeName[PAPI_MIN_STR_LEN]; - sprintf(rangeName, "PAPI_Range_%d", gpu_ctl->gpu_id); + int gpu_id = gpu_ctl->gpu_id; + sprintf(rangeName, "PAPI_Range_%d", gpu_id); CUpti_Profiler_PushRange_Params pushRangeParams = { .structSize = CUpti_Profiler_PushRange_Params_STRUCT_SIZE, .pPriv = NULL, @@ -1366,12 +1367,13 @@ static void init_main_htable(void) { int i, val = 1, base = 2; - /* allocate 2 ^ 21 metric names, this matches the number of bits for the event encoding format */ + /* allocate (2 ^ NAMEID_WIDTH) metric names, this matches the + number of bits for the event encoding format */ for (i = 0; i < NAMEID_WIDTH; i++) { val *= base; } - /* allocate memory */ + /* initialize struct */ cuptiu_table_p = papi_malloc(sizeof(cuptiu_event_table_t)); cuptiu_table_p->capacity = val; cuptiu_table_p->count = 0; @@ -1415,14 +1417,14 @@ int cuptip_init(void) goto fn_fail; } - /* initialize the main event table for metric collection */ init_main_htable(); papi_errno = init_all_metrics(); if (papi_errno != PAPI_OK) { goto fn_fail; } - /* initialize hash table with cuda native events */ + + /* collect metrics */ init_event_table(); papi_errno = cuInitPtr(0); @@ -1471,7 +1473,7 @@ int verify_events(uint64_t *events_id, int num_events, return papi_errno; } - /* for the current device table get the next event index */ + /* for a specific device table, get the current event index */ idx = state->gpu_ctl[info.device].added_events->count; metricName = state->gpu_ctl[info.device].added_events->cuda_evts[idx]; @@ -1522,7 +1524,6 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t return PAPI_ENOMEM; } - /* for each unique gpu store the gpu id for that gpu index */ for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { state->gpu_ctl[gpu_id].gpu_id = gpu_id; } @@ -1533,13 +1534,13 @@ int cuptip_ctx_create(cuptic_info_t 
thr_info, cuptip_control_t *pstate, uint64_t goto fn_exit; } - /* creates a MetricsContext */ + /* create a MetricsContext */ papi_errno = nvpw_cuda_metricscontext_create(state); if (papi_errno != PAPI_OK) { goto fn_exit; } - /* verify user added events are available on the system */ + /* verify user added events are available on the machine */ papi_errno = verify_events(events_id, num_events, state); if (papi_errno != PAPI_OK) { goto fn_exit; @@ -1646,8 +1647,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) { COMPDBG("Entering.\n"); int papi_errno, gpu_id, i, j = 0, method, evt_pos; - long long counts[30]; - long long *counter_vals = state->counters; + long long counts[30], *counter_vals = state->counters; cuptip_gpu_state_t *gpu_ctl = NULL; CUcontext userCtx = NULL, ctx = NULL; @@ -1698,14 +1698,11 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) } for (i = 0; i < gpu_ctl->added_events->count; i++) { - printf("counts value: %d and gpu_id: %d\n", counts[i], gpu_id); - printf("evt_pos: %d\n", gpu_ctl->added_events->evt_pos[i]); evt_pos = gpu_ctl->added_events->evt_pos[i]; if (state->read_count == 0) { counter_vals[evt_pos] = counts[i]; } else { - printf("WE ENTER ELSE STATEMENT.\n"); /* determine collection method such as max, min, sum, and avg for an added Cuda native event */ method = get_event_collection_method(gpu_ctl->added_events->cuda_evts[i]); switch (method) { @@ -2031,7 +2028,7 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, int papi_errno; char description[256]; int *count = &evt_table->count; - cuptiu_event_t *events = evt_table->events; + cuptiu_event_t *events = evt_table->events; /* check to see if evt_name argument has been provided */ if (evt_name == NULL) { @@ -2058,7 +2055,7 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, return PAPI_ESYS; } } - + cuptiu_dev_set(&event->device_map, gpu_id); return PAPI_OK; diff --git 
a/src/components/cuda/linux-cuda.c b/src/components/cuda/linux-cuda.c index 2a6f8485c..22c508fd0 100644 --- a/src/components/cuda/linux-cuda.c +++ b/src/components/cuda/linux-cuda.c @@ -143,10 +143,9 @@ static int cuda_init_component(int cidx) _cuda_vector.cmp_info.num_native_events = -1; _cuda_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx; - //_cuda_vector.cmp_info.initialized = 1; _cuda_vector.cmp_info.disabled = PAPI_EDELAY_INIT; sprintf(_cuda_vector.cmp_info.disabled_reason, - "Not initialized. Access component events to initialize it."); + "Not initialized. Access component events to initialize it."); return PAPI_EDELAY_INIT; } @@ -172,7 +171,6 @@ static int cuda_init_private(void) _papi_hwi_lock(COMPONENT_LOCK); SUBDBG("ENTER\n"); if (_cuda_vector.cmp_info.initialized) goto fn_exit; - SUBDBG("Proceeding\n"); papi_errno = cuptid_init(); if (papi_errno != PAPI_OK) { @@ -203,12 +201,9 @@ static int cuda_init_private(void) static int check_n_initialize(void) { - - //_papi_hwi_lock(COMPONENT_LOCK); if (!_cuda_vector.cmp_info.initialized) { return cuda_init_private(); } - //_papi_hwi_unlock(COMPONENT_LOCK); return _cuda_vector.cmp_info.disabled; } @@ -221,9 +216,7 @@ static int cuda_ntv_enum_events(unsigned int *event_code, int modifier) } uint64_t code = *(uint64_t *) event_code; - //_papi_hwi_lock(COMPONENT_LOCK); papi_errno = cuptid_evt_enum(&code, modifier); - //_papi_hwi_unlock(COMPONENT_LOCK); *event_code = (unsigned int) code; fn_exit: @@ -241,9 +234,7 @@ static int cuda_ntv_name_to_code(const char *name, unsigned int *event_code) } uint64_t code; - //_papi_hwi_lock(COMPONENT_LOCK); papi_errno = cuptid_evt_name_to_code(name, &code); - //_papi_hwi_unlock(COMPONENT_LOCK); *event_code = (unsigned int) code; fn_exit: @@ -277,9 +268,7 @@ static int cuda_ntv_code_to_descr(unsigned int event_code, char *descr, int len) goto fn_fail; } - //_papi_hwi_lock(COMPONENT_LOCK); papi_errno = cuptid_evt_code_to_descr((uint64_t) event_code, descr, len); - 
//_papi_hwi_unlock(COMPONENT_LOCK); fn_exit: SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno)); @@ -371,9 +360,10 @@ static int cuda_update_control_state(hwd_control_state_t *ctl, NativeInfo_t *ntv if (papi_errno != PAPI_OK) { goto fn_exit; } - printf("ntv_count: %d\n", ntv_count); + /* needed to make sure multipass events are caught with proper error code (PAPI_EMULPASS)*/ papi_errno = cuptid_ctx_create(cuda_ctl->info, &(cuda_ctl->cuptid_ctx), cuda_ctl->events_id, cuda_ctl->num_events); + fn_exit: SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno)); return papi_errno; @@ -414,8 +404,6 @@ int update_native_events(cuda_control_t *ctl, NativeInfo_t *ntv_info, sorted_events[i].frontend_idx = i; } - //qsort(sorted_events, ntv_count, sizeof(struct event_map_item), compare); - for (i = 0; i < ntv_count; ++i) { ctl->events_id[i] = sorted_events[i].event_id; ntv_info[sorted_events[i].frontend_idx].ni_position = i; diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c index ca4b953d6..492eb260f 100644 --- a/src/components/cuda/papi_cupti_common.c +++ b/src/components/cuda/papi_cupti_common.c @@ -619,8 +619,6 @@ int cuptic_ctxarr_update_current(cuptic_info_t info) return PAPI_EMISC; } - printf("gpu_id inside update_current is: %d\n", gpu_id); - /* return cuda context bound to the calling CPU thread */ cuda_err = cuCtxGetCurrentPtr(&pctx); if (cuda_err != cudaSuccess) { @@ -638,7 +636,6 @@ int cuptic_ctxarr_update_current(cuptic_info_t info) } /* cuda context not found for calling CPU thread */ else { - printf("We create a cuda context.\n"); cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC); cudaCheckErrors(cuCtxGetCurrentPtr(&info[gpu_id].ctx), return PAPI_EMISC); LOGDBG("Using primary device context %p for device %d.\n", info[gpu_id].ctx, gpu_id); @@ -657,13 +654,6 @@ int cuptic_ctxarr_update_current(cuptic_info_t info) int cuptic_ctxarr_get_ctx(cuptic_info_t info, int gpu_idx, CUcontext *ctx) { *ctx = info[gpu_idx].ctx; - 
printf("gpu_idx: %d\n", gpu_idx); - if (*ctx == NULL) { - printf("ctx is null.\n"); - *ctx = info[0].ctx; - if (*ctx != NULL) printf("Gpu id 0 is not null.\n"); - } - return PAPI_OK; } From b3a87f402a4c59c3db04b9e9830640aba9974b53 Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Tue, 11 Feb 2025 03:06:36 +0000 Subject: [PATCH 09/16] Update the function cuptic_ctxarr_update_current to work with papi_command_line with various device IDs appended to :device=#. --- src/components/cuda/cupti_profiler.c | 8 +++- src/components/cuda/papi_cupti_common.c | 60 ++++++++++++------------- src/components/cuda/papi_cupti_common.h | 2 +- 3 files changed, 37 insertions(+), 33 deletions(-) diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index b754b5c4a..363bbdea6 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -1528,8 +1528,14 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t state->gpu_ctl[gpu_id].gpu_id = gpu_id; } + event_info_t info; + papi_errno = evt_id_to_info(events_id[num_events - 1], &info); + if (papi_errno != PAPI_OK) { + return papi_errno; + } + /* register the user created cuda context for the current gpu if not already known */ - papi_errno = cuptic_ctxarr_update_current(thr_info); + papi_errno = cuptic_ctxarr_update_current(thr_info, info.device); if (papi_errno != PAPI_OK) { goto fn_exit; } diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c index 492eb260f..71d001bdd 100644 --- a/src/components/cuda/papi_cupti_common.c +++ b/src/components/cuda/papi_cupti_common.c @@ -607,46 +607,44 @@ int cuptic_ctxarr_create(cuptic_info_t *pinfo) * Struct that contains a Cuda context, that can be indexed into based * on device id. 
*/ -int cuptic_ctxarr_update_current(cuptic_info_t info) +int cuptic_ctxarr_update_current(cuptic_info_t info, int evt_dev_id) { int gpu_id; CUcontext pctx; CUresult cuda_err; + CUdevice dev_id; - /* get device currently being used */ - cuda_err = cudaGetDevicePtr(&gpu_id); - if (cuda_err != cudaSuccess) { - return PAPI_EMISC; - } - - /* return cuda context bound to the calling CPU thread */ + // See if a user created a CUDA context on the + // calling cpu thread. cuda_err = cuCtxGetCurrentPtr(&pctx); - if (cuda_err != cudaSuccess) { - return PAPI_EMISC; - } - /* check to see if Cuda context exists for device */ - if (info[gpu_id].ctx == NULL) { - /* cuda context found for the calling CPU thread */ - if (pctx != NULL) { - LOGDBG("Registering device = %d with ctx = %p.\n", gpu_id, pctx); - /* store current context into struct */ - cuda_err = cuCtxGetCurrentPtr(&info[gpu_id].ctx); - if (cuda_err != cudaSuccess) + if (cuda_err == CUDA_SUCCESS && pctx != NULL) { + // Get the device id associated with the user created CUDA context + cuda_err = cuCtxGetDevicePtr(&dev_id); + if (cuda_err != CUDA_SUCCESS) { + return PAPI_EMISC; + } + + if (info[dev_id].ctx == NULL) { + // Store current user created CUDA context + cuda_err = cuCtxGetCurrentPtr(&info[dev_id].ctx); + if (cuda_err != CUDA_SUCCESS) { return PAPI_EMISC; + } } - /* cuda context not found for calling CPU thread */ - else { - cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC); - cudaCheckErrors(cuCtxGetCurrentPtr(&info[gpu_id].ctx), return PAPI_EMISC); - LOGDBG("Using primary device context %p for device %d.\n", info[gpu_id].ctx, gpu_id); + else if (info[dev_id].ctx != pctx) { + ERRDBG("Warning: cuda context for gpu %d has changed from %p to %p\n", gpu_id, info[gpu_id].ctx, pctx); } - } - - /* if context exists then see if it has changed; if it has then keep the first - seen one, but show warning */ - else if (info[gpu_id].ctx != pctx) { - ERRDBG("Warning: cuda context for gpu %d has changed from %p to %p\n", 
gpu_id, info[gpu_id].ctx, pctx); - } + } + // If a user did not create a CUDA context, then we will create one + // for them. Note, that for machine with multiple devices, we need to + // call cudaSetDevice. + else { + cudaArtCheckErrors(cudaSetDevicePtr(evt_dev_id), return PAPI_EMISC); + cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC); + + cudaCheckErrors(cuCtxGetCurrentPtr(&info[evt_dev_id].ctx), return PAPI_EMISC); + cudaCheckErrors(cuCtxPopCurrentPtr(&pctx), PAPI_EMISC); + } return PAPI_OK; } diff --git a/src/components/cuda/papi_cupti_common.h b/src/components/cuda/papi_cupti_common.h index 13a30828f..398d75267 100644 --- a/src/components/cuda/papi_cupti_common.h +++ b/src/components/cuda/papi_cupti_common.h @@ -65,7 +65,7 @@ int cuptic_shutdown(void); /* context management interfaces */ int cuptic_ctxarr_create(cuptic_info_t *pinfo); -int cuptic_ctxarr_update_current(cuptic_info_t info); +int cuptic_ctxarr_update_current(cuptic_info_t info, int evt_dev_id); int cuptic_ctxarr_get_ctx(cuptic_info_t info, int gpu_idx, CUcontext *ctx); int cuptic_ctxarr_destroy(cuptic_info_t *pinfo); From e355daab784df0546a5f5073e638ee9d35761a02 Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Wed, 12 Feb 2025 18:01:28 +0000 Subject: [PATCH 10/16] Update function to append default device qualifier. --- src/components/cuda/cupti_profiler.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index 363bbdea6..c567a9ac6 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -2583,8 +2583,9 @@ static int evt_name_to_basename(const char *name, char *base, int len) } /** @class evt_name_to_device - * @brief Take a Cuda native event name with a device qualifer appended to - * it and collect the device number. + * @brief Return the device number for a user provided Cuda native event. 
+ * This can be done with a device qualifier present (:device=#) or + * we internally find the first device the native event exists for. * @param *name * Cuda native event name with a device qualifier appended. * @param *device @@ -2593,11 +2594,27 @@ static int evt_name_to_basename(const char *name, char *base, int len) static int evt_name_to_device(const char *name, int *device) { char *p = strstr(name, ":device="); + // User did provide :device=# qualifier if (p) { *device = (int) strtol(p + strlen(":device="), NULL, 10); } + // User did not provide :device=# qualifier else { - *device = 0; + int i, htable_errno; + cuptiu_event_t *event; + + htable_errno = htable_find(cuptiu_table_p->htable, name, (void **) &event); + if (htable_errno != HTABLE_SUCCESS) { + return PAPI_EINVAL; + } + + // Search for the first device the event exists for. + for (i = 0; i < num_gpus; ++i) { + if (cuptiu_dev_check(event->device_map, i)) { + *device = i; + break; + } + } } return PAPI_OK; } From 1bdd85bec6bbf149cbd1fb761e82b16a20c441fc Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Wed, 12 Feb 2025 19:01:35 +0000 Subject: [PATCH 11/16] Fix indexing in init_event_table to free allocated memory. 
--- src/components/cuda/cupti_profiler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index c567a9ac6..44a626681 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -2004,7 +2004,7 @@ int init_event_table(void) NVPW_MetricsContext_GetMetricNames_End_Params getMetricNameEndParams = { .structSize = NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE, .pPriv = NULL, - .pMetricsContext = cuptiu_table_p->avail_gpu_info[gpu_idx].pmetricsContextCreateParams->pMetricsContext, + .pMetricsContext = cuptiu_table_p->avail_gpu_info[table_idx].pmetricsContextCreateParams->pMetricsContext, }; nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_EndPtr((NVPW_MetricsContext_GetMetricNames_End_Params *) &getMetricNameEndParams), goto fn_fail ); } From cb47152c8c2db8728bae1fc1b9c10de6f80cac1e Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Fri, 14 Feb 2025 13:46:46 +0000 Subject: [PATCH 12/16] Add guard against a user PopCurrent and additional check for snprintf. 
--- src/components/cuda/linux-cuda.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/components/cuda/linux-cuda.c b/src/components/cuda/linux-cuda.c index 22c508fd0..4b1cf549c 100644 --- a/src/components/cuda/linux-cuda.c +++ b/src/components/cuda/linux-cuda.c @@ -170,14 +170,18 @@ static int cuda_init_private(void) _papi_hwi_lock(COMPONENT_LOCK); SUBDBG("ENTER\n"); - if (_cuda_vector.cmp_info.initialized) goto fn_exit; + + if (_cuda_vector.cmp_info.initialized) { + SUBDBG("Skipping cuda_init_private, as the Cuda event table has already been initialized.\n"); + goto fn_exit; + } papi_errno = cuptid_init(); if (papi_errno != PAPI_OK) { /* get and assign the string literal for the disabled reason */ cuptid_disabled_reason_get(&disabled_reason); len = snprintf(_cuda_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", disabled_reason); - if (len > PAPI_MAX_STR_LEN) { + if (len < 0 || len > PAPI_MAX_STR_LEN) { SUBDBG("The disabled reason has been truncated.\n"); } goto fn_fail; From fe2a676864924af1a62482f83acf456b237e7f4d Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Fri, 14 Feb 2025 13:47:30 +0000 Subject: [PATCH 13/16] Add file changes from papi_cupti_common. --- src/components/cuda/papi_cupti_common.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c index 71d001bdd..3d74b0635 100644 --- a/src/components/cuda/papi_cupti_common.c +++ b/src/components/cuda/papi_cupti_common.c @@ -623,7 +623,6 @@ int cuptic_ctxarr_update_current(cuptic_info_t info, int evt_dev_id) if (cuda_err != CUDA_SUCCESS) { return PAPI_EMISC; } - if (info[dev_id].ctx == NULL) { // Store current user created CUDA context cuda_err = cuCtxGetCurrentPtr(&info[dev_id].ctx); @@ -639,11 +638,14 @@ int cuptic_ctxarr_update_current(cuptic_info_t info, int evt_dev_id) // for them. 
Note, that for machine with multiple devices, we need to // call cudaSetDevice. else { - cudaArtCheckErrors(cudaSetDevicePtr(evt_dev_id), return PAPI_EMISC); - cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC); - - cudaCheckErrors(cuCtxGetCurrentPtr(&info[evt_dev_id].ctx), return PAPI_EMISC); - cudaCheckErrors(cuCtxPopCurrentPtr(&pctx), PAPI_EMISC); + // Guard against a user PopCurrent + if (info[evt_dev_id].ctx == NULL) { + cudaArtCheckErrors(cudaSetDevicePtr(evt_dev_id), return PAPI_EMISC); + cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC); + + cudaCheckErrors(cuCtxGetCurrentPtr(&info[evt_dev_id].ctx), return PAPI_EMISC); + cudaCheckErrors(cuCtxPopCurrentPtr(&pctx), PAPI_EMISC); + } } return PAPI_OK; From dbd3a5e579eeea33c54bcbfe163796bec63cbedc Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Fri, 14 Feb 2025 15:19:25 +0000 Subject: [PATCH 14/16] Syntax changes/clean up. --- src/components/cuda/cupti_profiler.c | 10 ++++------ src/components/cuda/cupti_utils.h | 1 - src/components/cuda/linux-cuda.c | 1 - 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index 44a626681..49dc061a1 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -1601,7 +1601,7 @@ int cuptip_ctx_start(cuptip_control_t state) ERRDBG("Profiling same gpu from multiple event sets not allowed.\n"); return papi_errno; } - /* get the cuda context */ + /* get the cuda context */ papi_errno = cuptic_ctxarr_get_ctx(state->info, gpu_id, &ctx); /* bind the specified CUDA context to the calling CPU thread */ cudaCheckErrors( cuCtxSetCurrentPtr(ctx), goto fn_fail_misc ); @@ -1913,11 +1913,9 @@ int cuptip_shutdown(void) */ int evt_id_create(event_info_t *info, uint64_t *event_id) { - *event_id = (uint64_t)(info->device << DEVICE_SHIFT); *event_id |= (uint64_t)(info->flags << QLMASK_SHIFT); *event_id |= (uint64_t)(info->nameid << NAMEID_SHIFT); - return 
PAPI_OK; } @@ -1955,8 +1953,8 @@ int evt_id_to_info(uint64_t event_id, event_info_t *info) */ int init_event_table(void) { - int dev_id, found, table_idx = 0; - int gpu_idx, i, listsubmetrics = 1, papi_errno = PAPI_OK; + int i, dev_id, found, table_idx = 0, papi_errno = PAPI_OK; + int listsubmetrics = 1 /* instatiate struct to collect the total metric count and metric names; instantiated here to avoid scoping issues */ @@ -2012,6 +2010,7 @@ int init_event_table(void) fn_exit: return papi_errno; fn_fail: + papi_errno = PAPI_EMISC; goto fn_exit; } @@ -2488,7 +2487,6 @@ static int evt_code_to_name(uint64_t event_code, char *name, int len) */ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) { - int papi_errno, i, gpu_id; char description[PAPI_HUGE_STR_LEN]; diff --git a/src/components/cuda/cupti_utils.h b/src/components/cuda/cupti_utils.h index 17c186111..742bda402 100644 --- a/src/components/cuda/cupti_utils.h +++ b/src/components/cuda/cupti_utils.h @@ -14,7 +14,6 @@ #include - typedef int64_t cuptiu_bitmap_t; typedef int (*cuptiu_dev_get_map_cb)(uint64_t event_id, int *dev_id); typedef NVPW_CUDA_MetricsContext_Create_Params MCCP_t; diff --git a/src/components/cuda/linux-cuda.c b/src/components/cuda/linux-cuda.c index 4b1cf549c..3142d4d26 100644 --- a/src/components/cuda/linux-cuda.c +++ b/src/components/cuda/linux-cuda.c @@ -320,7 +320,6 @@ static int cuda_shutdown_thread(hwd_context_t *ctx) static int cuda_init_control_state(hwd_control_state_t __attribute__((unused)) *ctl) { COMPDBG("Entering.\n"); - //return PAPI_OK; return check_n_initialize(); } From d701c84dd6baebbf532effb1c3e5540a13de4840 Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Fri, 14 Feb 2025 15:45:38 +0000 Subject: [PATCH 15/16] Forgot a semicolon. 
--- src/components/cuda/cupti_profiler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index 49dc061a1..6dae222fb 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -1954,7 +1954,7 @@ int evt_id_to_info(uint64_t event_id, event_info_t *info) int init_event_table(void) { int i, dev_id, found, table_idx = 0, papi_errno = PAPI_OK; - int listsubmetrics = 1 + int listsubmetrics = 1; /* instatiate struct to collect the total metric count and metric names; instantiated here to avoid scoping issues */ From 72443c62b0e0c796a2e2c6dccd9b8932c269c2c2 Mon Sep 17 00:00:00 2001 From: Treece Burgess Date: Thu, 20 Feb 2025 19:47:56 +0000 Subject: [PATCH 16/16] Update Cuda context creation --- src/components/cuda/cupti_profiler.c | 79 ++++++++++++------------- src/components/cuda/linux-cuda.c | 1 - src/components/cuda/papi_cupti_common.c | 48 ++++++++------- 3 files changed, 65 insertions(+), 63 deletions(-) diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index 6dae222fb..73a094a47 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -107,7 +107,7 @@ static int initialize_perfworks_api(void); /* utility functions to init metrics and cuda native event table */ static int init_all_metrics(void); -static void init_main_htable(void); +static int init_main_htable(void); static int init_event_table(void); static int shutdown_event_table(void); static void free_all_enumerated_metrics(void); @@ -136,7 +136,7 @@ static int calculate_num_passes(struct NVPA_RawMetricsConfig *pRawMetricsConfig, NVPA_RawMetricRequest *rmr, int *num_pass); /* functions to set and get cuda native event info or convert cuda native events */ -static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, unsigned int evt_code, int evt_pos, int gpu_id); +static int 
get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, int gpu_id); static int verify_events(uint64_t *events_id, int num_events, cuptip_control_t state); static int evt_id_to_info(uint64_t event_id, event_info_t *info); static int evt_id_create(event_info_t *info, uint64_t *event_id); @@ -179,7 +179,6 @@ NVPA_Status ( *NVPW_Profiler_CounterData_GetRangeDescriptionsPtr ) (NVPW_Profile NVPA_Status ( *NVPW_MetricsContext_SetCounterDataPtr ) (NVPW_MetricsContext_SetCounterData_Params* params); NVPA_Status ( *NVPW_MetricsContext_EvaluateToGpuValuesPtr ) (NVPW_MetricsContext_EvaluateToGpuValues_Params* params); NVPA_Status ( *NVPW_RawMetricsConfig_GetNumPassesPtr ) (NVPW_RawMetricsConfig_GetNumPasses_Params* params); -NVPA_Status ( *NVPW_RawMetricsConfig_GetNumPassesPtr_V2 ) (NVPW_RawMetricsConfig_GetNumPasses_V2_Params* params); NVPA_Status ( *NVPW_RawMetricsConfig_SetCounterAvailabilityPtr ) (NVPW_RawMetricsConfig_SetCounterAvailability_Params* params); NVPA_Status ( *NVPW_RawMetricsConfig_IsAddMetricsPossiblePtr ) (NVPW_RawMetricsConfig_IsAddMetricsPossible_Params* params); NVPA_Status ( *NVPW_MetricsContext_GetCounterNames_BeginPtr ) (NVPW_MetricsContext_GetCounterNames_Begin_Params* pParams); @@ -1289,11 +1288,7 @@ static int find_same_chipname(int gpu_id) static int init_all_metrics(void) { int gpu_id, papi_errno = PAPI_OK; - cuptiu_table_p->avail_gpu_info = (gpu_record_t *) papi_calloc(num_gpus, sizeof(gpu_record_t)); - if (cuptiu_table_p->avail_gpu_info == NULL) { - papi_errno = PAPI_ENOMEM; - goto fn_exit; - } + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { papi_errno = get_chip_name(gpu_id, cuptiu_table_p->avail_gpu_info[gpu_id].chip_name); if (papi_errno != PAPI_OK) { @@ -1363,9 +1358,9 @@ static void free_all_enumerated_metrics(void) /** @class init_main_htable * @brief Initialize the main htable used to collect metrics. 
*/ -static void init_main_htable(void) +static int init_main_htable(void) { - int i, val = 1, base = 2; + int i, val = 1, base = 2, papi_errno = PAPI_OK; /* allocate (2 ^ NAMEID_WIDTH) metric names, this matches the number of bits for the event encoding format */ @@ -1374,13 +1369,31 @@ static void init_main_htable(void) } /* initialize struct */ - cuptiu_table_p = papi_malloc(sizeof(cuptiu_event_table_t)); + cuptiu_table_p = (cuptiu_event_table_t *) papi_malloc(sizeof(cuptiu_event_table_t)); + if (cuptiu_table_p == NULL) { + goto fn_fail; + } cuptiu_table_p->capacity = val; - cuptiu_table_p->count = 0; - cuptiu_table_p->events = papi_calloc(val, sizeof(cuptiu_event_t)); - + cuptiu_table_p->count = 0; + + cuptiu_table_p->events = (cuptiu_event_t *) papi_calloc(val, sizeof(cuptiu_event_t)); + if (cuptiu_table_p->events == NULL) { + goto fn_fail; + } + + cuptiu_table_p->avail_gpu_info = (gpu_record_t *) papi_calloc(num_gpus, sizeof(gpu_record_t)); + if (cuptiu_table_p->avail_gpu_info == NULL) { + goto fn_fail; + } + /* initialize the main hash table for metric collection */ htable_init(&cuptiu_table_p->htable); + + fn_exit: + return papi_errno; + fn_fail: + papi_errno = PAPI_ENOMEM; + goto fn_exit; } /** @class cuptip_init @@ -1417,7 +1430,10 @@ int cuptip_init(void) goto fn_fail; } - init_main_htable(); + papi_errno = init_main_htable(); + if (papi_errno != PAPI_OK) { + goto fn_fail; + } papi_errno = init_all_metrics(); if (papi_errno != PAPI_OK) { @@ -1581,13 +1597,8 @@ int cuptip_ctx_start(cuptip_control_t state) /* create a context handle */ CUcontext userCtx, ctx; - /* return the Cuda context bound to the calling CPU thread */ + // return the Cuda context bound to the calling CPU thread cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); - /* if no context is found, create a context */ - if (userCtx == NULL) { - cudaArtCheckErrors( cudaFreePtr(NULL), goto fn_fail_misc ); - cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); - } /* 
enumerate through all of the unique gpus */ for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { @@ -1658,10 +1669,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) CUcontext userCtx = NULL, ctx = NULL; cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); - if (userCtx == NULL) { - cudaArtCheckErrors( cudaFreePtr(NULL), goto fn_fail_misc ); - cudaArtCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); - } + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); if (gpu_ctl->added_events->count == 0) { @@ -1704,6 +1712,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) } for (i = 0; i < gpu_ctl->added_events->count; i++) { + printf("Device id: %d and counts: %d\n", gpu_id, counts[i]); evt_pos = gpu_ctl->added_events->evt_pos[i]; if (state->read_count == 0) { counter_vals[evt_pos] = counts[i]; @@ -1780,7 +1789,7 @@ int cuptip_ctx_reset(cuptip_control_t state) int i; for (i = 0; i < state->read_count; i++) { - state->counters[i] = 1; + state->counters[i] = 0; } state->read_count = 0; @@ -1802,13 +1811,7 @@ int cuptip_ctx_stop(cuptip_control_t state) cuptip_gpu_state_t *gpu_ctl; CUcontext userCtx = NULL, ctx = NULL; - - cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); - if (userCtx == NULL) { - cudaArtCheckErrors( cudaFreePtr(NULL), goto fn_fail_misc ); - cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); - } for (gpu_id=0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); @@ -1990,7 +1993,7 @@ int init_event_table(void) /* loop through metrics to add to overall event table */ for (i = 0; i < cuptiu_table_p->avail_gpu_info[table_idx].num_metrics; i++) { - papi_errno = get_ntv_events( cuptiu_table_p, cuptiu_table_p->avail_gpu_info[table_idx].metric_names[i], i, 0, dev_id); + papi_errno = get_ntv_events( cuptiu_table_p, cuptiu_table_p->avail_gpu_info[table_idx].metric_names[i], dev_id); if (papi_errno != PAPI_OK) goto fn_exit; 
} @@ -2016,22 +2019,16 @@ int init_event_table(void) } /** @class get_ntv_events - * @brief Add the event name, event code, and event position to the hash table. + * @brief Store Cuda native events and their corresponding device(s). * * @param *evt_table * Structure containing member variables such as name, evt_code, evt_pos, and htable. * @param *evt_name * Cuda native event name. - * @param evt_code - * Event code which corresponds to the Cuda native event name. - * @param evt_pos - * Position within the hash table. */ -static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, unsigned int evt_code, int evt_pos, int gpu_id) +static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, int gpu_id) { - int papi_errno; - char description[256]; int *count = &evt_table->count; cuptiu_event_t *events = evt_table->events; diff --git a/src/components/cuda/linux-cuda.c b/src/components/cuda/linux-cuda.c index 3142d4d26..35d153808 100644 --- a/src/components/cuda/linux-cuda.c +++ b/src/components/cuda/linux-cuda.c @@ -515,7 +515,6 @@ static int cuda_reset(hwd_context_t __attribute__((unused)) *ctx, hwd_control_st return PAPI_EMISC; } - /* To-do: Understand how this connects to values, memory addresses are not the same. */ papi_errno = cuptid_ctx_reset(cuda_ctl->cuptid_ctx); return papi_errno; diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c index 3d74b0635..e11b6e2f5 100644 --- a/src/components/cuda/papi_cupti_common.c +++ b/src/components/cuda/papi_cupti_common.c @@ -609,44 +609,50 @@ int cuptic_ctxarr_create(cuptic_info_t *pinfo) */ int cuptic_ctxarr_update_current(cuptic_info_t info, int evt_dev_id) { - int gpu_id; CUcontext pctx; CUresult cuda_err; CUdevice dev_id; - // See if a user created a CUDA context on the - // calling cpu thread. 
+ // If a Cuda context already exists, get it cuda_err = cuCtxGetCurrentPtr(&pctx); - if (cuda_err == CUDA_SUCCESS && pctx != NULL) { + if (cuda_err != CUDA_SUCCESS) { + return PAPI_EMISC; + } + + // Cuda context was found + if (pctx != NULL) { //info[dev_id].ctx != NULL does not work + SUBDBG("A Cuda context was found.\n"); // Get the device id associated with the user created CUDA context cuda_err = cuCtxGetDevicePtr(&dev_id); if (cuda_err != CUDA_SUCCESS) { return PAPI_EMISC; } - if (info[dev_id].ctx == NULL) { - // Store current user created CUDA context + + if(info[dev_id].ctx == NULL) { + // Store current user created Cuda context cuda_err = cuCtxGetCurrentPtr(&info[dev_id].ctx); if (cuda_err != CUDA_SUCCESS) { return PAPI_EMISC; } } else if (info[dev_id].ctx != pctx) { - ERRDBG("Warning: cuda context for gpu %d has changed from %p to %p\n", gpu_id, info[gpu_id].ctx, pctx); + ERRDBG("Warning: cuda context for device %d has changed from %p to %p\n", dev_id, info[dev_id].ctx, pctx); } - } - // If a user did not create a CUDA context, then we will create one - // for them. Note, that for machine with multiple devices, we need to - // call cudaSetDevice. - else { - // Guard against a user PopCurrent - if (info[evt_dev_id].ctx == NULL) { - cudaArtCheckErrors(cudaSetDevicePtr(evt_dev_id), return PAPI_EMISC); - cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC); - - cudaCheckErrors(cuCtxGetCurrentPtr(&info[evt_dev_id].ctx), return PAPI_EMISC); - cudaCheckErrors(cuCtxPopCurrentPtr(&pctx), PAPI_EMISC); - } - } + } + // Cuda context was not found + // Note, that for machines with multiple devices, we need to + // call cudaSetDevice. + else { + // Guard against a user PopCurrent + if (info[evt_dev_id].ctx == NULL) { + SUBDBG("A Cuda context was not found. 
Therefore, one is created for: %d\n", evt_dev_id); + cudaArtCheckErrors(cudaSetDevicePtr(evt_dev_id), return PAPI_EMISC); + cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC); + + cudaCheckErrors(cuCtxGetCurrentPtr(&info[evt_dev_id].ctx), return PAPI_EMISC); + cudaCheckErrors(cuCtxPopCurrentPtr(&pctx), PAPI_EMISC); + } + } return PAPI_OK; }