diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index 31bae0ca1..73a094a47 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -47,9 +47,7 @@ typedef struct byte_array_s byte_array_t; typedef struct cuptip_gpu_state_s cuptip_gpu_state_t; -typedef struct list_metrics_s list_metrics_t; typedef struct NVPA_MetricsContext NVPA_MetricsContext; -typedef NVPW_CUDA_MetricsContext_Create_Params MCCP_t; typedef struct { int device; @@ -65,7 +63,7 @@ struct byte_array_s { struct cuptip_gpu_state_s { int gpu_id; - cuptiu_event_table_t *event_names; + cuptiu_event_table_t *added_events; int rmr_count; NVPA_RawMetricRequest *rmr; MCCP_t *pmetricsContextCreateParams; @@ -87,19 +85,11 @@ struct cuptip_control_s { cuptic_info_t info; }; -struct list_metrics_s { - char chip_name[32]; - MCCP_t *pmetricsContextCreateParams; - int num_metrics; - cuptiu_event_table_t *nv_metrics; -}; - static void *dl_nvpw; static int num_gpus; -static int num_unique_gpus = 1; -static list_metrics_t *avail_events; +static gpu_record_t *avail_gpu_info; -static cuptiu_event_table_t cuptiu_table; +/* main event table to store metrics */ static cuptiu_event_table_t *cuptiu_table_p; /* load and unload cuda function pointers */ @@ -117,6 +107,7 @@ static int initialize_perfworks_api(void); /* utility functions to init metrics and cuda native event table */ static int init_all_metrics(void); +static int init_main_htable(void); static int init_event_table(void); static int shutdown_event_table(void); static void free_all_enumerated_metrics(void); @@ -145,21 +136,21 @@ static int calculate_num_passes(struct NVPA_RawMetricsConfig *pRawMetricsConfig, NVPA_RawMetricRequest *rmr, int *num_pass); /* functions to set and get cuda native event info or convert cuda native events */ -static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, unsigned int evt_code, int evt_pos, int gpu_id); -static int verify_events(uint64_t *events_id, int num_events, cuptiu_event_table_t **targeted_event_names); +static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, int gpu_id); +static int verify_events(uint64_t *events_id, int num_events, cuptip_control_t state); static int evt_id_to_info(uint64_t event_id, event_info_t *info); static int evt_id_create(event_info_t *info, uint64_t *event_id); static int evt_code_to_name(uint64_t event_code, char *name, int len); static int evt_name_to_basename(const char *name, char *base, int len); static int evt_name_to_device(const char *name, int *device); static int retrieve_metric_descr( NVPA_MetricsContext *pMetricsContext, const char *evt_name, - char *description, int gpu_id ); + char *description, const char *chip_name ); static int retrieve_metric_rmr( NVPA_MetricsContext *pMetricsContext, const char *evt_name, int *numDep, NVPA_RawMetricRequest **pRMR ); /* misc */ static int get_event_collection_method(const char *evt_name); -static int get_event_names_rmr(cuptip_gpu_state_t *gpu_ctl); +static int get_added_events_rmr(cuptip_gpu_state_t *gpu_ctl); static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl); static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts); @@ -188,7 +179,6 @@ NVPA_Status ( *NVPW_Profiler_CounterData_GetRangeDescriptionsPtr ) (NVPW_Profile NVPA_Status ( *NVPW_MetricsContext_SetCounterDataPtr ) (NVPW_MetricsContext_SetCounterData_Params* params); NVPA_Status ( *NVPW_MetricsContext_EvaluateToGpuValuesPtr ) 
(NVPW_MetricsContext_EvaluateToGpuValues_Params* params); NVPA_Status ( *NVPW_RawMetricsConfig_GetNumPassesPtr ) (NVPW_RawMetricsConfig_GetNumPasses_Params* params); -NVPA_Status ( *NVPW_RawMetricsConfig_GetNumPassesPtr_V2 ) (NVPW_RawMetricsConfig_GetNumPasses_V2_Params* params); NVPA_Status ( *NVPW_RawMetricsConfig_SetCounterAvailabilityPtr ) (NVPW_RawMetricsConfig_SetCounterAvailability_Params* params); NVPA_Status ( *NVPW_RawMetricsConfig_IsAddMetricsPossiblePtr ) (NVPW_RawMetricsConfig_IsAddMetricsPossible_Params* params); NVPA_Status ( *NVPW_MetricsContext_GetCounterNames_BeginPtr ) (NVPW_MetricsContext_GetCounterNames_Begin_Params* pParams); @@ -468,7 +458,6 @@ static int initialize_perfworks_api(void) return PAPI_OK; } - static int get_chip_name(int dev_num, char* chipName) { int papi_errno; @@ -487,7 +476,7 @@ static int get_chip_name(int dev_num, char* chipName) return PAPI_OK; } -/** @class get_event_names_rmr +/** @class get_added_events_rmr * @brief For a Cuda native event name collect raw metrics and count * of raw metrics for collection. Raw Metrics are one layer of the Metric API * and contains the list of raw counters and generates configuration file @@ -497,7 +486,7 @@ static int get_chip_name(int dev_num, char* chipName) * Structure of type cuptip_gpu_state_t which has member variables such as * gpu_id, rmr, rmr_count, and more. */ -static int get_event_names_rmr(cuptip_gpu_state_t *gpu_ctl) +static int get_added_events_rmr(cuptip_gpu_state_t *gpu_ctl) { COMPDBG("Entering.\n"); int gpu_id, num_dep, count_raw_metrics = 0, papi_errno = PAPI_OK; @@ -506,14 +495,12 @@ static int get_event_names_rmr(cuptip_gpu_state_t *gpu_ctl) cuptiu_event_t *evt_rec; /* for each event in the event table collect the raw metric requests */ - for (i = 0; i < gpu_ctl->event_names->count; i++) { - /* Not using the correct global event names now.*/ + for (i = 0; i < gpu_ctl->added_events->count; i++) { papi_errno = retrieve_metric_rmr( gpu_ctl->pmetricsContextCreateParams->pMetricsContext, - gpu_ctl->event_names->added_cuda_evts[i], &num_dep, + gpu_ctl->added_events->cuda_evts[i], &num_dep, &collect_rmr ); - /* why is PAPI_ENOEVNT hard coded? 
*/ if (papi_errno != PAPI_OK) { papi_errno = PAPI_ENOEVNT; goto fn_exit; @@ -631,7 +618,7 @@ static int nvpw_cuda_metricscontext_create(cuptip_control_t state) seee cuptip_gpu_state_s */ cuptip_gpu_state_t *gpu_ctl; - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); found = find_same_chipname(gpu_id); if (found > -1) { @@ -649,7 +636,7 @@ static int nvpw_cuda_metricscontext_create(cuptip_control_t state) /* setting metadata values */ pMCCP->structSize = NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE; - pMCCP->pChipName = avail_events[gpu_id].chip_name; + pMCCP->pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name; /* create context */ nvpa_err = NVPW_CUDA_MetricsContext_CreatePtr(pMCCP); @@ -677,7 +664,7 @@ static int nvpw_cuda_metricscontext_destroy(cuptip_control_t state) int gpu_id, found, papi_errno = PAPI_OK; cuptip_gpu_state_t *gpu_ctl; - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); found = find_same_chipname(gpu_id); if (found > -1) { @@ -714,13 +701,13 @@ static int check_multipass(cuptip_control_t state) NVPA_Status nvpa_err; cuptip_gpu_state_t *gpu_ctl; - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); - if (gpu_ctl->event_names->count == 0) { + if (gpu_ctl->added_events->count == 0) { continue; } - papi_errno = get_event_names_rmr(gpu_ctl); + papi_errno = get_added_events_rmr(gpu_ctl); if (papi_errno != PAPI_OK) { goto fn_exit; } @@ -730,7 +717,7 @@ static int check_multipass(cuptip_control_t state) .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE, .pPriv = NULL, .activityKind = NVPA_ACTIVITY_KIND_PROFILER, - .pChipName = avail_events[gpu_id].chip_name, + .pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name, }; nvpa_err = NVPW_CUDA_RawMetricsConfig_CreatePtr( &nvpw_metricsConfigCreateParams @@ -807,7 +794,7 @@ static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl) /** @class metric_get_config_image * @brief Retrieves binary ConfigImage for the Cuda native event metrics listed - * for collection. The function get_event_names_rmr( ... ) must be + * for collection. The function get_added_events_rmr( ... ) must be * called before this step is possible. * @param *gpu_ctl * Structure of type cuptip_gpu_state_t which has member variables such as @@ -816,11 +803,13 @@ static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl) static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) { COMPDBG("Entering.\n"); + int gpu_id = gpu_ctl->gpu_id; + NVPW_CUDA_RawMetricsConfig_Create_Params nvpw_metricsConfigCreateParams = { .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE, .pPriv = NULL, .activityKind = NVPA_ACTIVITY_KIND_PROFILER, - .pChipName = avail_events[gpu_ctl->gpu_id].chip_name, + .pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name, }; nvpwCheckErrors( NVPW_CUDA_RawMetricsConfig_CreatePtr(&nvpw_metricsConfigCreateParams), goto fn_fail ); @@ -832,7 +821,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) .pCounterAvailabilityImage = gpu_ctl->counterAvailabilityImage.data, }; nvpwCheckErrors( NVPW_RawMetricsConfig_SetCounterAvailabilityPtr(&setCounterAvailabilityParams), goto fn_fail ); - }; + } /* NOTE: maxPassCount is being set to 1 as a final safety net to limit metric collection to a single pass. 
Metrics that require multiple passes would fail further down at AddMetrics due to this. @@ -903,7 +892,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) /** @class metric_get_counter_data_prefix_image * @brief Retrieves binary CounterDataPrefix for the Cuda native event metrics - * listed for collection. The function get_event_names_rmr( ... ) + * listed for collection. The function get_added_events_rmr( ... ) * must be called before this step is possible. * @param *gpu_ctl * Structure of type cuptip_gpu_state_t which has member variables such as @@ -912,10 +901,12 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl) static int metric_get_counter_data_prefix_image(cuptip_gpu_state_t *gpu_ctl) { COMPDBG("Entering.\n"); + int gpu_id = gpu_ctl->gpu_id; + NVPW_CounterDataBuilder_Create_Params counterDataBuilderCreateParams = { .structSize = NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE, .pPriv = NULL, - .pChipName = avail_events[gpu_ctl->gpu_id].chip_name, + .pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name, }; nvpwCheckErrors( NVPW_CounterDataBuilder_CreatePtr(&counterDataBuilderCreateParams), goto fn_fail ); @@ -1118,8 +1109,9 @@ static int begin_profiling(cuptip_gpu_state_t *gpu_ctl) }; cuptiCheckErrors( cuptiProfilerEnableProfilingPtr(&enableProfilingParams), goto fn_fail ); - char rangeName[64]; - sprintf(rangeName, "PAPI_Range_%d", gpu_ctl->gpu_id); + char rangeName[PAPI_MIN_STR_LEN]; + int gpu_id = gpu_ctl->gpu_id; + sprintf(rangeName, "PAPI_Range_%d", gpu_id); CUpti_Profiler_PushRange_Params pushRangeParams = { .structSize = CUpti_Profiler_PushRange_Params_STRUCT_SIZE, .pPriv = NULL, @@ -1207,7 +1199,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts) { COMPDBG("eval_metric_values. dev = %d\n", gpu_ctl->gpu_id); int i, papi_errno = PAPI_OK; - int numMetrics = gpu_ctl->event_names->count; + int numMetrics = gpu_ctl->added_events->count; double *gpuValues; char **metricNames; @@ -1231,7 +1223,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts) } for (i = 0; i < numMetrics; i++) { - metricNames[i] = gpu_ctl->event_names->added_cuda_evts[i]; + metricNames[i] = gpu_ctl->added_events->cuda_evts[i]; LOGDBG("Setting metric name %s\n", metricNames[i]); } @@ -1258,7 +1250,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts) nvpwCheckErrors( NVPW_MetricsContext_EvaluateToGpuValuesPtr(&evalToGpuParams), goto fn_fail ); /* store the gpu values */ - for (i = 0; i < (int) gpu_ctl->event_names->count; i++) { + for (i = 0; i < (int) gpu_ctl->added_events->count; i++) { counts[i] = gpuValues[i]; } @@ -1282,14 +1274,13 @@ static int find_same_chipname(int gpu_id) { int i; for (i = 0; i < gpu_id; i++) { - if (!strcmp(avail_events[gpu_id].chip_name, avail_events[i].chip_name)) { + if (!strcmp(cuptiu_table_p->avail_gpu_info[gpu_id].chip_name, cuptiu_table_p->avail_gpu_info[i].chip_name)) { return i; } } return -1; } - /** @class init_all_metrics * @brief Initialize metrics for a specific GPU. 
* @@ -1297,22 +1288,18 @@ static int find_same_chipname(int gpu_id) static int init_all_metrics(void) { int gpu_id, papi_errno = PAPI_OK; - avail_events = (list_metrics_t *) papi_calloc(num_unique_gpus, sizeof(list_metrics_t)); - if (avail_events == NULL) { - papi_errno = PAPI_ENOMEM; - goto fn_exit; - } - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { - papi_errno = get_chip_name(gpu_id, avail_events[gpu_id].chip_name); + + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { + papi_errno = get_chip_name(gpu_id, cuptiu_table_p->avail_gpu_info[gpu_id].chip_name); if (papi_errno != PAPI_OK) { goto fn_exit; } } int found; - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { found = find_same_chipname(gpu_id); if (found > -1) { - avail_events[gpu_id].pmetricsContextCreateParams = avail_events[found].pmetricsContextCreateParams; + cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams = cuptiu_table_p->avail_gpu_info[found].pmetricsContextCreateParams; continue; } MCCP_t *pMCCP = (MCCP_t *) papi_calloc(1, sizeof(MCCP_t)); @@ -1321,10 +1308,10 @@ static int init_all_metrics(void) goto fn_exit; } pMCCP->structSize = NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE; - pMCCP->pChipName = avail_events[gpu_id].chip_name; + pMCCP->pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name; nvpwCheckErrors( NVPW_CUDA_MetricsContext_CreatePtr(pMCCP), goto fn_fail ); - avail_events[gpu_id].pmetricsContextCreateParams = pMCCP; + cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams = pMCCP; } fn_exit: @@ -1342,34 +1329,71 @@ static void free_all_enumerated_metrics(void) COMPDBG("Entering.\n"); int gpu_id, found; NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams; - if (avail_events == NULL) { + if (cuptiu_table_p->avail_gpu_info == NULL) { return; } - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { found = find_same_chipname(gpu_id); if (found > -1) { - avail_events[gpu_id].num_metrics = 0; - avail_events[gpu_id].nv_metrics = NULL; - avail_events[gpu_id].pmetricsContextCreateParams = NULL; + cuptiu_table_p->avail_gpu_info[gpu_id].num_metrics = 0; + cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams = NULL; continue; } - if (avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext) { + if (cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams->pMetricsContext) { metricsContextDestroyParams = (NVPW_MetricsContext_Destroy_Params) { .structSize = NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE, .pPriv = NULL, - .pMetricsContext = avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext, + .pMetricsContext = cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams->pMetricsContext, }; nvpwCheckErrors(NVPW_MetricsContext_DestroyPtr(&metricsContextDestroyParams), ); } - papi_free(avail_events[gpu_id].pmetricsContextCreateParams); - avail_events[gpu_id].pmetricsContextCreateParams = NULL; + papi_free(cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams); + cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams = NULL; - if (avail_events[gpu_id].nv_metrics) { - cuptiu_event_table_destroy( &(avail_events[gpu_id].nv_metrics) ); - } } - papi_free(avail_events); - avail_events = NULL; + papi_free(cuptiu_table_p->avail_gpu_info); + cuptiu_table_p->avail_gpu_info = NULL; +} + +/** @class init_main_htable + * @brief Initialize the main htable used to collect metrics. 
+*/ +static int init_main_htable(void) +{ + int i, val = 1, base = 2, papi_errno = PAPI_OK; + + /* allocate (2 ^ NAMEID_WIDTH) metric names, this matches the + number of bits for the event encoding format */ + for (i = 0; i < NAMEID_WIDTH; i++) { + val *= base; + } + + /* initialize struct */ + cuptiu_table_p = (cuptiu_event_table_t *) papi_malloc(sizeof(cuptiu_event_table_t)); + if (cuptiu_table_p == NULL) { + goto fn_fail; + } + cuptiu_table_p->capacity = val; + cuptiu_table_p->count = 0; + + cuptiu_table_p->events = (cuptiu_event_t *) papi_calloc(val, sizeof(cuptiu_event_t)); + if (cuptiu_table_p->events == NULL) { + goto fn_fail; + } + + cuptiu_table_p->avail_gpu_info = (gpu_record_t *) papi_calloc(num_gpus, sizeof(gpu_record_t)); + if (cuptiu_table_p->avail_gpu_info == NULL) { + goto fn_fail; + } + + /* initialize the main hash table for metric collection */ + htable_init(&cuptiu_table_p->htable); + + fn_exit: + return papi_errno; + fn_fail: + papi_errno = PAPI_ENOMEM; + goto fn_exit; } /** @class cuptip_init @@ -1392,13 +1416,12 @@ int cuptip_init(void) if (papi_errno != PAPI_OK) { goto fn_fail; } - - /* if no gpu's are found exit */ + if (num_gpus <= 0) { cuptic_disabled_reason_set("No GPUs found on system."); goto fn_fail; } - + /* initialize cupti profiler and perfworks api */ papi_errno = initialize_cupti_profiler_api(); papi_errno += initialize_perfworks_api(); @@ -1407,67 +1430,81 @@ int cuptip_init(void) goto fn_fail; } + papi_errno = init_main_htable(); + if (papi_errno != PAPI_OK) { + goto fn_fail; + } + papi_errno = init_all_metrics(); if (papi_errno != PAPI_OK) { goto fn_fail; } + + /* collect metrics */ + init_event_table(); + papi_errno = cuInitPtr(0); if (papi_errno != CUDA_SUCCESS) { cuptic_disabled_reason_set("Failed to initialize CUDA driver API."); goto fn_fail; } - /* initialize hash table with cuda native events */ - init_event_table(); - cuptiu_table_p = &cuptiu_table; - return PAPI_OK; fn_fail: return PAPI_EMISC; } /** @class verify_events - * @brief Verify user added events and create a subset table to be used for - * start, stop, etc. + * @brief Verify user added events and store metadata i.e. metric names + * and device id's . * @param *events_id * Cuda native event id's. * @param num_events * Number of Cuda native events a user is wanting to count. - * @param **targeted_event_names - * Event table to hold subset of user added events. + * @param state + * Struct that holds read count, running, cuptip_info_t, and + * cuptip_gpu_state_t. 
*/ int verify_events(uint64_t *events_id, int num_events, - cuptiu_event_table_t **targeted_event_names) + cuptip_control_t state) { - int papi_errno = PAPI_OK, i; - char name[PAPI_MAX_STR_LEN] = { 0 }; - - papi_errno = cuptiu_event_table_create_init_capacity( - num_events * num_gpus, - sizeof(cuptiu_event_t), targeted_event_names - ); - if (papi_errno != PAPI_OK) { - goto fn_exit; - } + int papi_errno, i; + char *metricName; + int idx; + + for (i = 0; i < num_gpus; i++) { + papi_errno = cuptiu_event_table_create_init_capacity( + num_events, + sizeof(cuptiu_event_t), &(state->gpu_ctl[i].added_events) + ); + if (papi_errno != PAPI_OK) { + return papi_errno; + } + } for (i = 0; i < num_events; i++) { event_info_t info; papi_errno = evt_id_to_info(events_id[i], &info); if (papi_errno != PAPI_OK) { - break; + return papi_errno; } - sprintf(name, "%s", cuptiu_table_p->events[info.nameid].name); - strcpy((*targeted_event_names)->added_cuda_evts[i], name); - (*targeted_event_names)->added_cuda_dev[i] = info.device; + + /* for a specific device table, get the current event index */ + idx = state->gpu_ctl[info.device].added_events->count; + + metricName = state->gpu_ctl[info.device].added_events->cuda_evts[idx]; + snprintf(metricName, PAPI_MAX_STR_LEN, "%s", cuptiu_table_p->events[info.nameid].name); + void *p; - if (htable_find(cuptiu_table_p->htable, name, (void **) &p) != HTABLE_SUCCESS) { - htable_insert((*targeted_event_names)->htable, name, (void **) &p ); + if (htable_find(cuptiu_table_p->htable, metricName, (void **) &p) != HTABLE_SUCCESS) { + return PAPI_ENOEVNT; } - (*targeted_event_names)->count++; + state->gpu_ctl[info.device].added_events->cuda_devs[idx] = info.device; + state->gpu_ctl[info.device].added_events->evt_pos[idx] = i; + state->gpu_ctl[info.device].added_events->count++; /* total number of events added for a specific device */ } - fn_exit: - return papi_errno; + return PAPI_OK; } /** @class cuptip_ctx_create @@ -1487,47 +1524,51 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t int papi_errno = PAPI_OK, gpu_id, i; long long *counters = NULL; char name[PAPI_2MAX_STR_LEN] = { 0 }; - cuptiu_event_table_t *targeted_event_names; - - papi_errno = verify_events(events_id, num_events, &targeted_event_names); - if (papi_errno != PAPI_OK) { - return papi_errno; - } - /* create a cuptip_control_t struct which contains read_count, running, cupti_info_t and cuptip_gpu_state_t */ cuptip_control_t state = (cuptip_control_t) papi_calloc (1, sizeof(struct cuptip_control_s)); if (state == NULL) { return PAPI_ENOMEM; } - /* allocate memory for the total number of gpus for the cuptip_gpu_state_t struct - with the device qualifier refactor we only want to count the total number of unique gpus */ - state->gpu_ctl = (cuptip_gpu_state_t *) papi_calloc(num_unique_gpus, sizeof(cuptip_gpu_state_t)); + state->gpu_ctl = (cuptip_gpu_state_t *) papi_calloc(num_gpus, sizeof(cuptip_gpu_state_t)); if (state->gpu_ctl == NULL) { return PAPI_ENOMEM; } counters = papi_malloc(num_events * sizeof(*counters)); + if (counters == NULL) { + return PAPI_ENOMEM; + } - /* for each unique gpu store the gpu id for that gpu index */ - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { state->gpu_ctl[gpu_id].gpu_id = gpu_id; - state->gpu_ctl[gpu_id].event_names = targeted_event_names; } + event_info_t info; + papi_errno = evt_id_to_info(events_id[num_events - 1], &info); + if (papi_errno != PAPI_OK) { + return papi_errno; + } + /* 
register the user created cuda context for the current gpu if not already known */ - papi_errno = cuptic_ctxarr_update_current(thr_info); + papi_errno = cuptic_ctxarr_update_current(thr_info, info.device); if (papi_errno != PAPI_OK) { goto fn_exit; } - /* creates a pMetricsContext */ + /* create a MetricsContext */ papi_errno = nvpw_cuda_metricscontext_create(state); if (papi_errno != PAPI_OK) { goto fn_exit; } - /* multipass is not supporter; therefore, we must check the Cuda native event */ + /* verify user added events are available on the machine */ + papi_errno = verify_events(events_id, num_events, state); + if (papi_errno != PAPI_OK) { + goto fn_exit; + } + + /* check to make sure added events do not require multiple passes */ papi_errno = check_multipass(state); if (papi_errno != PAPI_OK) { goto fn_exit; @@ -1556,27 +1597,22 @@ int cuptip_ctx_start(cuptip_control_t state) /* create a context handle */ CUcontext userCtx, ctx; - /* return the Cuda context bound to the calling CPU thread */ + // return the Cuda context bound to the calling CPU thread cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); - /* if no context is found, create a context */ - if (userCtx == NULL) { - cudaArtCheckErrors( cudaFreePtr(NULL), goto fn_fail_misc ); - cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); - } /* enumerate through all of the unique gpus */ - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); - if (gpu_ctl->event_names->count == 0) { + if (gpu_ctl->added_events->count == 0) { continue; } - LOGDBG("Device num %d: event_count %d, rmr count %d\n", gpu_id, gpu_ctl->event_names->count, gpu_ctl->rmr_count); - papi_errno = cuptic_device_acquire(state->gpu_ctl[gpu_id].event_names); + LOGDBG("Device num %d: event_count %d, rmr count %d\n", gpu_id, gpu_ctl->added_events->count, gpu_ctl->rmr_count); + papi_errno = cuptic_device_acquire(state->gpu_ctl[gpu_id].added_events); if (papi_errno != PAPI_OK) { ERRDBG("Profiling same gpu from multiple event sets not allowed.\n"); return papi_errno; } - /* get the cuda context for the unique gpu */ + /* get the cuda context */ papi_errno = cuptic_ctxarr_get_ctx(state->info, gpu_id, &ctx); /* bind the specified CUDA context to the calling CPU thread */ cudaCheckErrors( cuCtxSetCurrentPtr(ctx), goto fn_fail_misc ); @@ -1627,20 +1663,16 @@ int cuptip_ctx_start(cuptip_control_t state) int cuptip_ctx_read(cuptip_control_t state, long long **counters) { COMPDBG("Entering.\n"); - int papi_errno, gpu_id, i, j = 0, method; + int papi_errno, gpu_id, i, j = 0, method, evt_pos; long long counts[30], *counter_vals = state->counters; cuptip_gpu_state_t *gpu_ctl = NULL; CUcontext userCtx = NULL, ctx = NULL; cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); - if (userCtx == NULL) { - cudaArtCheckErrors( cudaFreePtr(NULL), goto fn_fail_misc ); - cudaArtCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); - } - for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); - if (gpu_ctl->event_names->count == 0) { + if (gpu_ctl->added_events->count == 0) { continue; } @@ -1678,22 +1710,25 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) if (papi_errno != PAPI_OK) { goto fn_exit; } - for (i = 0; i < (int) gpu_ctl->event_names->count; i++) { + + for (i = 0; i < gpu_ctl->added_events->count; i++) { + printf("Device id: %d and counts: 
%d\n", gpu_id, counts[i]); + evt_pos = gpu_ctl->added_events->evt_pos[i]; if (state->read_count == 0) { - counter_vals[i] = counts[i]; + counter_vals[evt_pos] = counts[i]; } else { /* determine collection method such as max, min, sum, and avg for an added Cuda native event */ - method = get_event_collection_method(gpu_ctl->event_names->added_cuda_evts[i]); + method = get_event_collection_method(gpu_ctl->added_events->cuda_evts[i]); switch (method) { case CUDA_SUM: - counter_vals[i] += counts[i]; + counter_vals[evt_pos] += counts[i]; break; case CUDA_MIN: - counter_vals[i] = counter_vals[i] < counts[i] ? counter_vals[i] : counts[i]; + counter_vals[evt_pos] = counter_vals[evt_pos] < counts[i] ? counter_vals[evt_pos] : counts[i]; break; case CUDA_MAX: - counter_vals[i] = counter_vals[i] > counts[i] ? counter_vals[i] : counts[i]; + counter_vals[evt_pos] = counter_vals[evt_pos] > counts[i] ? counter_vals[evt_pos] : counts[i]; break; case CUDA_AVG: /* (size * average + value) / (size + 1) @@ -1701,15 +1736,15 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) average - current average value - number to add to the average */ - counter_vals[i] = (state->read_count * counter_vals[j++] + counts[i]) / (state->read_count + 1); + counter_vals[evt_pos] = (state->read_count * counter_vals[j++] + counts[i]) / (state->read_count + 1); break; default: - counter_vals[i] = counts[i]; + counter_vals[evt_pos] = counts[i]; break; } } } - *counters = state->counters; + *counters = counter_vals; cuptiCheckErrors( cuptiProfilerCounterDataImageInitializePtr(&gpu_ctl->initializeParams), goto fn_fail_misc ); cuptiCheckErrors( cuptiProfilerCounterDataImageInitializeScratchBufferPtr(&gpu_ctl->initScratchBufferParams), goto fn_fail_misc ); @@ -1721,7 +1756,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters) }; cuptiCheckErrors( cuptiProfilerBeginPassPtr(&beginPassParams), goto fn_fail_misc ); - char rangeName[64]; + char rangeName[PAPI_MIN_STR_LEN]; sprintf(rangeName, "PAPI_Range_%d", gpu_ctl->gpu_id); CUpti_Profiler_PushRange_Params pushRangeParams = { .structSize = CUpti_Profiler_PushRange_Params_STRUCT_SIZE, @@ -1754,7 +1789,7 @@ int cuptip_ctx_reset(cuptip_control_t state) int i; for (i = 0; i < state->read_count; i++) { - state->counters[i] = 1; + state->counters[i] = 0; } state->read_count = 0; @@ -1776,17 +1811,11 @@ int cuptip_ctx_stop(cuptip_control_t state) cuptip_gpu_state_t *gpu_ctl; CUcontext userCtx = NULL, ctx = NULL; - - cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); - if (userCtx == NULL) { - cudaArtCheckErrors( cudaFreePtr(NULL), goto fn_fail_misc ); - cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc ); - } - for (gpu_id=0; gpu_id < num_unique_gpus; gpu_id++) { + for (gpu_id=0; gpu_id < num_gpus; gpu_id++) { gpu_ctl = &(state->gpu_ctl[gpu_id]); - if (gpu_ctl->event_names->count == 0) { + if (gpu_ctl->added_events->count == 0) { continue; } papi_errno = cuptic_ctxarr_get_ctx(state->info, gpu_id, &ctx); @@ -1795,7 +1824,7 @@ int cuptip_ctx_stop(cuptip_control_t state) if (papi_errno != PAPI_OK) { goto fn_fail; } - papi_errno = cuptic_device_release(state->gpu_ctl[gpu_id].event_names); + papi_errno = cuptic_device_release(state->gpu_ctl[gpu_id].added_events); if (papi_errno != PAPI_OK) { goto fn_fail; } @@ -1823,9 +1852,9 @@ int cuptip_ctx_destroy(cuptip_control_t *pstate) cuptip_control_t state = *pstate; int i, j; int papi_errno = nvpw_cuda_metricscontext_destroy(state); - for (i = 0; i < num_unique_gpus; i++) { + for (i = 0; i < 
num_gpus; i++) { reset_cupti_prof_config_images( &(state->gpu_ctl[i]) ); - cuptiu_event_table_destroy( &(state->gpu_ctl[i].event_names) ); + cuptiu_event_table_destroy( &(state->gpu_ctl[i].added_events) ); for (j = 0; j < state->gpu_ctl[i].rmr_count; j++) { papi_free((void *) state->gpu_ctl[i].rmr[j].pMetricName); } @@ -1915,10 +1944,6 @@ int evt_id_to_info(uint64_t event_id, event_info_t *info) return PAPI_ENOEVNT; } - if (cuptiu_dev_check(cuptiu_table_p->events[info->nameid].device_map, info->device) == 0) { - return PAPI_ENOEVNT; - } - if (info->nameid >= cuptiu_table_p->count) { return PAPI_ENOEVNT; } @@ -1931,48 +1956,56 @@ int evt_id_to_info(uint64_t event_id, event_info_t *info) */ int init_event_table(void) { - int gpu_idx, dev_id, i, listsubmetrics = 1, papi_errno = PAPI_OK; - - for (gpu_idx = 0; gpu_idx < num_unique_gpus; gpu_idx++) { - NVPW_MetricsContext_GetMetricNames_Begin_Params getMetricNameBeginParams = { - .structSize = NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE, - .pPriv = NULL, - .pMetricsContext = avail_events[gpu_idx].pmetricsContextCreateParams->pMetricsContext, - .hidePeakSubMetrics = !listsubmetrics, - .hidePerCycleSubMetrics = !listsubmetrics, - .hidePctOfPeakSubMetrics = !listsubmetrics, - }; - nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_BeginPtr(&getMetricNameBeginParams), goto fn_fail ); + int i, dev_id, found, table_idx = 0, papi_errno = PAPI_OK; + int listsubmetrics = 1; - avail_events[gpu_idx].num_metrics = getMetricNameBeginParams.numMetrics; - cuptiu_table.events = papi_calloc(avail_events[gpu_idx].num_metrics, sizeof(cuptiu_event_t)); - if (cuptiu_table.events == NULL) { - papi_errno = PAPI_ENOMEM; - goto fn_fail; + /* instatiate struct to collect the total metric count and metric names; + instantiated here to avoid scoping issues */ + NVPW_MetricsContext_GetMetricNames_Begin_Params getMetricNameBeginParams = { NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE }; + + /* loop through all available devices on the current system */ + for (dev_id = 0; dev_id < num_gpus; dev_id++) { + found = find_same_chipname(dev_id); + /* unique device found, collect metadata */ + if (found == -1) { + /* increment table index */ + if (dev_id > 0) + table_idx++; + + /* assigning values to member variables */ + getMetricNameBeginParams.pPriv = NULL; + getMetricNameBeginParams.pMetricsContext = cuptiu_table_p->avail_gpu_info[table_idx].pmetricsContextCreateParams->pMetricsContext; + getMetricNameBeginParams.hidePeakSubMetrics = !listsubmetrics; + getMetricNameBeginParams.hidePerCycleSubMetrics = !listsubmetrics; + getMetricNameBeginParams.hidePctOfPeakSubMetrics = !listsubmetrics; + + nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_BeginPtr(&getMetricNameBeginParams), goto fn_fail ); + + /* for each unique device found, store both the total number of metrics and metric names */ + cuptiu_table_p->avail_gpu_info[table_idx].num_metrics = getMetricNameBeginParams.numMetrics; + cuptiu_table_p->avail_gpu_info[table_idx].metric_names = getMetricNameBeginParams.ppMetricNames; } - - papi_errno = cuptiu_event_table_create_init_capacity(avail_events[gpu_idx].num_metrics * num_gpus, sizeof(cuptiu_event_t), &(avail_events[gpu_idx].nv_metrics)); - if (papi_errno != PAPI_OK) { - goto fn_exit; + /* device metadata already collected, set table index */ + else { + /* set table_idx to */ + table_idx = found; } - for (dev_id = 0; dev_id < num_gpus; dev_id++) { - for (i = 0; i < avail_events[gpu_idx].num_metrics; i++) { - papi_errno = get_ntv_events( 
avail_events[gpu_idx].nv_metrics, - getMetricNameBeginParams.ppMetricNames[i], - i, 0, dev_id ); - if (papi_errno != PAPI_OK) { - goto fn_exit; - } - } + + /* loop through metrics to add to overall event table */ + for (i = 0; i < cuptiu_table_p->avail_gpu_info[table_idx].num_metrics; i++) { + papi_errno = get_ntv_events( cuptiu_table_p, cuptiu_table_p->avail_gpu_info[table_idx].metric_names[i], dev_id); + if (papi_errno != PAPI_OK) + goto fn_exit; } - cuptiu_table.events = papi_realloc(cuptiu_table.events, avail_events[gpu_idx].nv_metrics->count * sizeof(cuptiu_event_t)); - cuptiu_table.count = avail_events[gpu_idx].nv_metrics->count; - cuptiu_table.htable = avail_events[gpu_idx].nv_metrics->htable; + } + + /* free memory */ + for (i = 0; i < table_idx; i++) { NVPW_MetricsContext_GetMetricNames_End_Params getMetricNameEndParams = { .structSize = NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE, .pPriv = NULL, - .pMetricsContext = avail_events[gpu_idx].pmetricsContextCreateParams->pMetricsContext, + .pMetricsContext = cuptiu_table_p->avail_gpu_info[table_idx].pmetricsContextCreateParams->pMetricsContext, }; nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_EndPtr((NVPW_MetricsContext_GetMetricNames_End_Params *) &getMetricNameEndParams), goto fn_fail ); } @@ -1980,39 +2013,33 @@ int init_event_table(void) fn_exit: return papi_errno; fn_fail: + papi_errno = PAPI_EMISC; goto fn_exit; } /** @class get_ntv_events - * @brief Add the event name, event code, and event position to the hash table. + * @brief Store Cuda native events and their corresponding device(s). * * @param *evt_table * Structure containing member variables such as name, evt_code, evt_pos, and htable. * @param *evt_name * Cuda native event name. - * @param evt_code - * Event code which corresponds to the Cuda native event name. - * @param evt_pos - * Position within the hash table. */ -static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, unsigned int evt_code, int evt_pos, int gpu_id) +static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, int gpu_id) { - int papi_errno; - char description[256]; int *count = &evt_table->count; - cuptiu_event_t *events = cuptiu_table.events; - + cuptiu_event_t *events = evt_table->events; + /* check to see if evt_name argument has been provided */ if (evt_name == NULL) { return PAPI_EINVAL; } /* check to see if capacity has been correctly allocated */ - if (evt_table->count >= evt_table->capacity) { - printf("Table count is larger than allocated capacity.\n"); - return PAPI_ENOMEM; + if (*count >= evt_table->capacity) { + return PAPI_EBUG; } cuptiu_event_t *event; @@ -2063,7 +2090,7 @@ static int shutdown_event_table(void) * @param gpu_id * Device number, e.g. 0, 1, 2, ... ,etc. 
*/ -static int retrieve_metric_descr( NVPA_MetricsContext *pMetricsContext, const char *evt_name, char *description, int gpu_id) +static int retrieve_metric_descr( NVPA_MetricsContext *pMetricsContext, const char *evt_name, char *description, const char *chip_name) { COMPDBG("Entering.\n"); int num_dep, i, len, passes, papi_errno; @@ -2134,7 +2161,7 @@ static int retrieve_metric_descr( NVPA_MetricsContext *pMetricsContext, const ch .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE, .pPriv = NULL, // assign to NULL .activityKind = NVPA_ACTIVITY_KIND_PROFILER, - .pChipName = avail_events[gpu_id].chip_name, + .pChipName = chip_name, }; nvpa_err = NVPW_CUDA_RawMetricsConfig_CreatePtr(&nvpw_metricsConfigCreateParams); if (nvpa_err != NVPA_STATUS_SUCCESS) { @@ -2348,7 +2375,6 @@ int cuptip_evt_code_to_descr(uint64_t event_code, char *descr, int len) */ int cuptip_evt_name_to_code(const char *name, uint64_t *event_code) { - int htable_errno, device, flags, nameid, papi_errno = PAPI_OK; cuptiu_event_t *event; char base[PAPI_MAX_STR_LEN] = { 0 }; @@ -2403,7 +2429,7 @@ int cuptip_evt_name_to_code(const char *name, uint64_t *event_code) * Maximum alloted characters for base Cuda native event name. */ int cuptip_evt_code_to_name(uint64_t event_code, char *name, int len) -{ +{ return evt_code_to_name(event_code, name, len); } @@ -2458,19 +2484,28 @@ static int evt_code_to_name(uint64_t event_code, char *name, int len) */ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) { + int papi_errno, i, gpu_id; + char description[PAPI_HUGE_STR_LEN]; - int papi_errno, len, gpu_id; + /* get the events nameid and flags */ event_info_t inf; - char description[PAPI_HUGE_STR_LEN]; papi_errno = evt_id_to_info(event_code, &inf); if (papi_errno != PAPI_OK) { return papi_errno; } - /* collect the description and calculated numpass for a specific nameid */ - if (cuptiu_table_p->events[inf.nameid].desc[0] == 0) { - papi_errno = retrieve_metric_descr( avail_events[0].pmetricsContextCreateParams->pMetricsContext, - cuptiu_table_p->events[inf.nameid].name, cuptiu_table_p->events[inf.nameid].desc, 0 ); + /* collect the description and calculated numpass for the Cuda event */ + if (cuptiu_table_p->events[inf.nameid].desc[0] == '\0') { + /* find a matching device id to get correct MetricsContext and chip name */ + for (i = 0; i < num_gpus; ++i) { + if (cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) { + gpu_id = i; + break; + } + } + papi_errno = retrieve_metric_descr( cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams->pMetricsContext, + cuptiu_table_p->events[inf.nameid].name, cuptiu_table_p->events[inf.nameid].desc, + cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams->pChipName ); if (papi_errno != PAPI_OK) { return papi_errno; } @@ -2478,30 +2513,32 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info) switch (inf.flags) { case (0): - /* cuda native event name */ + /* store details for the Cuda event */ snprintf( info->symbol, PAPI_HUGE_STR_LEN, "%s", cuptiu_table_p->events[inf.nameid].name ); - /* cuda native event short description */ snprintf( info->short_descr, PAPI_MIN_STR_LEN, "%s", cuptiu_table_p->events[inf.nameid].desc ); - /* cuda native event long description */ snprintf( info->long_descr, PAPI_HUGE_STR_LEN, "%s", cuptiu_table_p->events[inf.nameid].desc ); break; case DEVICE_FLAG: { - int i; + int init_metric_dev_id; char devices[PAPI_MAX_STR_LEN] = { 0 }; for (i = 0; i < num_gpus; ++i) { if 
(cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) { + /* for an event, store the first device found to use with :device=#, + as on a heterogenous system events may not appear on each device */ + if (devices[0] == '\0') { + init_metric_dev_id = i; + } + sprintf(devices + strlen(devices), "%i,", i); } } *(devices + strlen(devices) - 1) = 0; - /* cuda native event name */ - snprintf( info->symbol, PAPI_HUGE_STR_LEN, "%s:device=%i", cuptiu_table_p->events[inf.nameid].name, inf.device ); - /* cuda native event short description */ + /* store details for the Cuda event */ + snprintf( info->symbol, PAPI_HUGE_STR_LEN, "%s:device=%i", cuptiu_table_p->events[inf.nameid].name, init_metric_dev_id ); snprintf( info->short_descr, PAPI_MIN_STR_LEN, "%s masks:Mandatory device qualifier [%s]", cuptiu_table_p->events[inf.nameid].desc, devices ); - /* cuda native event long description */ snprintf( info->long_descr, PAPI_HUGE_STR_LEN, "%s masks:Mandatory device qualifier [%s]", cuptiu_table_p->events[inf.nameid].desc, devices ); break; @@ -2541,8 +2578,9 @@ static int evt_name_to_basename(const char *name, char *base, int len) } /** @class evt_name_to_device - * @brief Take a Cuda native event name with a device qualifer appended to - * it and collect the device number. + * @brief Return the device number for a user provided Cuda native event. + * This can be done with a device qualifier present (:device=#) or + * we internally find the first device the native event exists for. * @param *name * Cuda native event name with a device qualifier appended. * @param *device @@ -2551,11 +2589,27 @@ static int evt_name_to_basename(const char *name, char *base, int len) static int evt_name_to_device(const char *name, int *device) { char *p = strstr(name, ":device="); + // User did provide :device=# qualifier if (p) { *device = (int) strtol(p + strlen(":device="), NULL, 10); } + // User did not provide :device=# qualifier else { - *device = 0; + int i, htable_errno; + cuptiu_event_t *event; + + htable_errno = htable_find(cuptiu_table_p->htable, name, (void **) &event); + if (htable_errno != HTABLE_SUCCESS) { + return PAPI_EINVAL; + } + + // Search for the first device the event exists for. 
+ for (i = 0; i < num_gpus; ++i) { + if (cuptiu_dev_check(event->device_map, i)) { + *device = i; + break; + } + } } return PAPI_OK; } diff --git a/src/components/cuda/cupti_utils.h b/src/components/cuda/cupti_utils.h index b79dd5130..742bda402 100644 --- a/src/components/cuda/cupti_utils.h +++ b/src/components/cuda/cupti_utils.h @@ -9,10 +9,14 @@ #define __CUPTI_UTILS_H__ #include + +#include + #include typedef int64_t cuptiu_bitmap_t; typedef int (*cuptiu_dev_get_map_cb)(uint64_t event_id, int *dev_id); +typedef NVPW_CUDA_MetricsContext_Create_Params MCCP_t; typedef struct event_record_s { char name[PAPI_2MAX_STR_LEN]; @@ -20,11 +24,20 @@ typedef struct event_record_s { cuptiu_bitmap_t device_map; } cuptiu_event_t; +typedef struct gpu_record_s { + char chip_name[PAPI_MIN_STR_LEN]; + MCCP_t *pmetricsContextCreateParams; + int num_metrics; + const char* const* metric_names; +} gpu_record_t; + typedef struct event_table_s { - unsigned int count; + int count; unsigned int capacity; - char added_cuda_evts[30][PAPI_2MAX_STR_LEN]; - int added_cuda_dev[30]; + char cuda_evts[30][PAPI_2MAX_STR_LEN]; + int cuda_devs[30]; + int evt_pos[30]; + gpu_record_t *avail_gpu_info; cuptiu_event_t *events; void *htable; } cuptiu_event_table_t; diff --git a/src/components/cuda/linux-cuda.c b/src/components/cuda/linux-cuda.c index 37e173864..35d153808 100644 --- a/src/components/cuda/linux-cuda.c +++ b/src/components/cuda/linux-cuda.c @@ -143,10 +143,9 @@ static int cuda_init_component(int cidx) _cuda_vector.cmp_info.num_native_events = -1; _cuda_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx; - //_cuda_vector.cmp_info.initialized = 1; _cuda_vector.cmp_info.disabled = PAPI_EDELAY_INIT; sprintf(_cuda_vector.cmp_info.disabled_reason, - "Not initialized. Access component events to initialize it."); + "Not initialized. 
Access component events to initialize it."); return PAPI_EDELAY_INIT; } @@ -172,12 +171,17 @@ static int cuda_init_private(void) _papi_hwi_lock(COMPONENT_LOCK); SUBDBG("ENTER\n"); + if (_cuda_vector.cmp_info.initialized) { + SUBDBG("Skipping cuda_init_private, as the Cuda event table has already been initialized.\n"); + goto fn_exit; + } + papi_errno = cuptid_init(); if (papi_errno != PAPI_OK) { /* get and assign the string literal for the disabled reason */ cuptid_disabled_reason_get(&disabled_reason); len = snprintf(_cuda_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", disabled_reason); - if (len > PAPI_MAX_STR_LEN) { + if (len < 0 || len > PAPI_MAX_STR_LEN) { SUBDBG("The disabled reason has been truncated.\n"); } goto fn_fail; @@ -204,7 +208,6 @@ static int check_n_initialize(void) if (!_cuda_vector.cmp_info.initialized) { return cuda_init_private(); } - return _cuda_vector.cmp_info.disabled; } @@ -215,7 +218,7 @@ static int cuda_ntv_enum_events(unsigned int *event_code, int modifier) if (papi_errno != PAPI_OK) { goto fn_exit; } - + uint64_t code = *(uint64_t *) event_code; papi_errno = cuptid_evt_enum(&code, modifier); *event_code = (unsigned int) code; @@ -233,7 +236,7 @@ static int cuda_ntv_name_to_code(const char *name, unsigned int *event_code) if (papi_errno != PAPI_OK) { goto fn_exit; } - + uint64_t code; papi_errno = cuptid_evt_name_to_code(name, &code); *event_code = (unsigned int) code; @@ -286,7 +289,7 @@ static int cuda_ntv_code_to_info(unsigned int event_code, PAPI_event_info_t *inf goto fn_fail; } - papi_errno = cuptid_evt_code_to_info((uint64_t) event_code, info); + papi_errno = cuptid_evt_code_to_info((uint64_t) event_code, info); fn_exit: SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno)); @@ -317,7 +320,7 @@ static int cuda_shutdown_thread(hwd_context_t *ctx) static int cuda_init_control_state(hwd_control_state_t __attribute__((unused)) *ctl) { COMPDBG("Entering.\n"); - return PAPI_OK; + return check_n_initialize(); } static int cuda_set_domain(hwd_control_state_t __attribute__((unused)) *ctrl, int domain) @@ -360,9 +363,10 @@ static int cuda_update_control_state(hwd_control_state_t *ctl, NativeInfo_t *ntv if (papi_errno != PAPI_OK) { goto fn_exit; } - + /* needed to make sure multipass events are caught with proper error code (PAPI_EMULPASS)*/ papi_errno = cuptid_ctx_create(cuda_ctl->info, &(cuda_ctl->cuptid_ctx), cuda_ctl->events_id, cuda_ctl->num_events); + fn_exit: SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno)); return papi_errno; @@ -403,8 +407,6 @@ int update_native_events(cuda_control_t *ctl, NativeInfo_t *ntv_info, sorted_events[i].frontend_idx = i; } - qsort(sorted_events, ntv_count, sizeof(struct event_map_item), compare); - for (i = 0; i < ntv_count; ++i) { ctl->events_id[i] = sorted_events[i].event_id; ntv_info[sorted_events[i].frontend_idx].ni_position = i; @@ -513,7 +515,6 @@ static int cuda_reset(hwd_context_t __attribute__((unused)) *ctx, hwd_control_st return PAPI_EMISC; } - /* To-do: Understand how this connects to values, memory addresses are not the same. */ papi_errno = cuptid_ctx_reset(cuda_ctl->cuptid_ctx); return papi_errno; diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c index f3aeb9e08..e11b6e2f5 100644 --- a/src/components/cuda/papi_cupti_common.c +++ b/src/components/cuda/papi_cupti_common.c @@ -607,45 +607,51 @@ int cuptic_ctxarr_create(cuptic_info_t *pinfo) * Struct that contains a Cuda context, that can be indexed into based * on device id. 
*/ -int cuptic_ctxarr_update_current(cuptic_info_t info) +int cuptic_ctxarr_update_current(cuptic_info_t info, int evt_dev_id) { - int gpu_id; CUcontext pctx; CUresult cuda_err; + CUdevice dev_id; - /* get device currently being used */ - cuda_err = cudaGetDevicePtr(&gpu_id); - if (cuda_err != cudaSuccess) { - return PAPI_EMISC; - } - - /* return cuda context bound to the calling CPU thread */ + // If a Cuda context already exists, get it cuda_err = cuCtxGetCurrentPtr(&pctx); - if (cuda_err != cudaSuccess) { + if (cuda_err != CUDA_SUCCESS) { return PAPI_EMISC; } - /* check to see if Cuda context exists for device */ - if (info[gpu_id].ctx == NULL) { - /* cuda context found for the calling CPU thread */ - if (pctx != NULL) { - LOGDBG("Registering device = %d with ctx = %p.\n", gpu_id, pctx); - /* store current context into struct */ - cuda_err = cuCtxGetCurrentPtr(&info[gpu_id].ctx); - if (cuda_err != cudaSuccess) + + // Cuda context was found + if (pctx != NULL) { //info[dev_id].ctx != NULL does not work + SUBDBG("A Cuda context was found.\n"); + // Get the device id associated with the user created CUDA context + cuda_err = cuCtxGetDevicePtr(&dev_id); + if (cuda_err != CUDA_SUCCESS) { + return PAPI_EMISC; + } + + if(info[dev_id].ctx == NULL) { + // Store current user created Cuda context + cuda_err = cuCtxGetCurrentPtr(&info[dev_id].ctx); + if (cuda_err != CUDA_SUCCESS) { return PAPI_EMISC; + } } - /* cuda context not found for calling CPU thread */ - else { - cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC); - cudaCheckErrors(cuCtxGetCurrentPtr(&info[gpu_id].ctx), return PAPI_EMISC); - LOGDBG("Using primary device context %p for device %d.\n", info[gpu_id].ctx, gpu_id); + else if (info[dev_id].ctx != pctx) { + ERRDBG("Warning: cuda context for device %d has changed from %p to %p\n", dev_id, info[dev_id].ctx, pctx); } } + // Cuda context was not found + // Note, that for machines with multiple devices, we need to + // call cudaSetDevice. + else { + // Guard against a user PopCurrent + if (info[evt_dev_id].ctx == NULL) { + SUBDBG("A Cuda context was not found. 
Therefore, one is created for: %d\n", evt_dev_id); + cudaArtCheckErrors(cudaSetDevicePtr(evt_dev_id), return PAPI_EMISC); + cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC); - /* if context exists then see if it has changed; if it has then keep the first - seen one, but show warning */ - else if (info[gpu_id].ctx != pctx) { - ERRDBG("Warning: cuda context for gpu %d has changed from %p to %p\n", gpu_id, info[gpu_id].ctx, pctx); + cudaCheckErrors(cuCtxGetCurrentPtr(&info[evt_dev_id].ctx), return PAPI_EMISC); + cudaCheckErrors(cuCtxPopCurrentPtr(&pctx), PAPI_EMISC); + } } return PAPI_OK; @@ -699,7 +705,7 @@ static int _devmask_events_get(cuptiu_event_table_t *evt_table, gpu_occupancy_t gpu_occupancy_t acq_mask = 0; cuptiu_event_t *evt_rec; for (i = 0; i < evt_table->count; i++) { - acq_mask |= (1 << evt_table->added_cuda_dev[i]); + acq_mask |= (1 << evt_table->cuda_devs[i]); } *bitmask = acq_mask; fn_exit: diff --git a/src/components/cuda/papi_cupti_common.h b/src/components/cuda/papi_cupti_common.h index 13a30828f..398d75267 100644 --- a/src/components/cuda/papi_cupti_common.h +++ b/src/components/cuda/papi_cupti_common.h @@ -65,7 +65,7 @@ int cuptic_shutdown(void); /* context management interfaces */ int cuptic_ctxarr_create(cuptic_info_t *pinfo); -int cuptic_ctxarr_update_current(cuptic_info_t info); +int cuptic_ctxarr_update_current(cuptic_info_t info, int evt_dev_id); int cuptic_ctxarr_get_ctx(cuptic_info_t info, int gpu_idx, CUcontext *ctx); int cuptic_ctxarr_destroy(cuptic_info_t *pinfo);
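
Usage note (illustrative, not part of the patch): the sketch below shows how the per-device event tables and the reworked :device=N handling introduced above would be exercised from user code. It assumes a PAPI build with the CUDA component enabled; the metric name is an example only, so list the metrics actually available on the target GPUs with papi_native_avail before trying it.

/*
 * Minimal sketch: add one CUPTI profiler metric on device 0 and read it.
 * The event name below is hypothetical; replace it with a metric reported
 * by papi_native_avail on the target system.
 */
#include <stdio.h>
#include <stdlib.h>
#include <papi.h>

int main(void)
{
    int evtset = PAPI_NULL;
    long long value = 0;

    /* initialize the PAPI library */
    if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) {
        fprintf(stderr, "PAPI_library_init failed\n");
        return EXIT_FAILURE;
    }

    if (PAPI_create_eventset(&evtset) != PAPI_OK) {
        fprintf(stderr, "PAPI_create_eventset failed\n");
        return EXIT_FAILURE;
    }

    /* example metric; the :device=0 qualifier selects which per-GPU
       added_events table receives this event */
    const char *evt = "cuda:::sm__cycles_active.sum:device=0";
    if (PAPI_add_named_event(evtset, evt) != PAPI_OK) {
        fprintf(stderr, "could not add %s\n", evt);
        return EXIT_FAILURE;
    }

    if (PAPI_start(evtset) != PAPI_OK) {
        fprintf(stderr, "PAPI_start failed\n");
        return EXIT_FAILURE;
    }

    /* ... launch CUDA work on device 0 here ... */

    if (PAPI_stop(evtset, &value) != PAPI_OK) {
        fprintf(stderr, "PAPI_stop failed\n");
        return EXIT_FAILURE;
    }

    printf("%s: %lld\n", evt, value);
    return EXIT_SUCCESS;
}

Because cuptic_ctxarr_update_current() now receives the event's device id, the component can select the event's GPU and create a primary context for it when the application has not bound one, which is why the sketch above does not create a CUDA context explicitly.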