From 6b817c9e54b0bacb63ec37578b3a2b1c8bf7d151 Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Fri, 3 Jan 2025 16:43:50 +0000
Subject: [PATCH 01/16] Multi-GPU support

---
 src/components/cuda/cupti_profiler.c    | 210 ++++++++++++++++--------
 src/components/cuda/papi_cupti_common.c |   1 +
 2 files changed, 147 insertions(+), 64 deletions(-)

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
index 31bae0ca1..b7509aa9a 100644
--- a/src/components/cuda/cupti_profiler.c
+++ b/src/components/cuda/cupti_profiler.c
@@ -91,15 +91,15 @@ struct list_metrics_s {
     char chip_name[32];
     MCCP_t *pmetricsContextCreateParams;
     int num_metrics;
-    cuptiu_event_table_t *nv_metrics;
+    const char* const* metric_names;
+    cuptiu_event_table_t *cuptiu_table_p;
 };
 
 static void *dl_nvpw;
 static int num_gpus;
-static int num_unique_gpus = 1;
+//static int num_unique_gpus = 2;
 static list_metrics_t *avail_events;
 
-static cuptiu_event_table_t cuptiu_table;
 static cuptiu_event_table_t *cuptiu_table_p;
 
 /* load and unload cuda function pointers */
@@ -118,6 +118,7 @@ static int initialize_perfworks_api(void);
 /* utility functions to init metrics and cuda native event table */
 static int init_all_metrics(void);
 static int init_event_table(void);
+static void init_main_htable(void);
 static int shutdown_event_table(void);
 static void free_all_enumerated_metrics(void);
 
@@ -146,7 +147,8 @@ static int calculate_num_passes(struct NVPA_RawMetricsConfig *pRawMetricsConfig,
 
 /* functions to set and get cuda native event info  or convert cuda native events  */
 static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, unsigned int evt_code, int evt_pos, int gpu_id);
-static int verify_events(uint64_t *events_id, int num_events, cuptiu_event_table_t **targeted_event_names);
+//static int verify_events(uint64_t *events_id, int num_events, cuptiu_event_table_t **targeted_event_names);
+static int verify_events(uint64_t *events_id, int num_events, cuptip_control_t state);
 static int evt_id_to_info(uint64_t event_id, event_info_t *info);
 static int evt_id_create(event_info_t *info, uint64_t *event_id);
 static int evt_code_to_name(uint64_t event_code, char *name, int len);
@@ -631,7 +633,7 @@ static int nvpw_cuda_metricscontext_create(cuptip_control_t state)
        seee cuptip_gpu_state_s */
     cuptip_gpu_state_t *gpu_ctl;
 
-    for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) {
+    for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
         found = find_same_chipname(gpu_id);
         if (found > -1) {
@@ -677,7 +679,7 @@ static int nvpw_cuda_metricscontext_destroy(cuptip_control_t state)
     int gpu_id, found, papi_errno = PAPI_OK;
     cuptip_gpu_state_t *gpu_ctl;
 
-    for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) {
+    for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
         found = find_same_chipname(gpu_id);
         if (found > -1) {
@@ -714,7 +716,7 @@ static int check_multipass(cuptip_control_t state)
     NVPA_Status nvpa_err;
     cuptip_gpu_state_t *gpu_ctl;
 
-    for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) {
+    for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
         if (gpu_ctl->event_names->count == 0) {
             continue;
@@ -1297,19 +1299,19 @@ static int find_same_chipname(int gpu_id)
 static int init_all_metrics(void)
 {
     int gpu_id, papi_errno = PAPI_OK;
-    avail_events = (list_metrics_t *) papi_calloc(num_unique_gpus, sizeof(list_metrics_t));
+    avail_events = (list_metrics_t *) papi_calloc(num_gpus, sizeof(list_metrics_t));
     if (avail_events == NULL) {
         papi_errno = PAPI_ENOMEM;
         goto fn_exit;
     }
-    for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) {
+    for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         papi_errno = get_chip_name(gpu_id, avail_events[gpu_id].chip_name);
         if (papi_errno != PAPI_OK) {
             goto fn_exit;
         }
     }
     int found;
-    for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) {
+    for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         found = find_same_chipname(gpu_id);
         if (found > -1) {
             avail_events[gpu_id].pmetricsContextCreateParams = avail_events[found].pmetricsContextCreateParams;
@@ -1345,11 +1347,11 @@ static void free_all_enumerated_metrics(void)
     if (avail_events == NULL) {
         return;
     }
-    for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) {
+    for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         found = find_same_chipname(gpu_id);
         if (found > -1) {
             avail_events[gpu_id].num_metrics = 0;
-            avail_events[gpu_id].nv_metrics = NULL;
+            avail_events[gpu_id].cuptiu_table_p = NULL;
             avail_events[gpu_id].pmetricsContextCreateParams = NULL;
             continue;
         }
@@ -1364,14 +1366,29 @@ static void free_all_enumerated_metrics(void)
         papi_free(avail_events[gpu_id].pmetricsContextCreateParams);
         avail_events[gpu_id].pmetricsContextCreateParams = NULL;
 
-        if (avail_events[gpu_id].nv_metrics) {
-            cuptiu_event_table_destroy( &(avail_events[gpu_id].nv_metrics) );
+        if (avail_events[gpu_id].cuptiu_table_p) {
+            cuptiu_event_table_destroy( &(avail_events[gpu_id].cuptiu_table_p) );
         }
     }
     papi_free(avail_events);
     avail_events = NULL;
 }
 
+static void init_main_htable(void) 
+{
+    int htable_errno;
+
+    /* capacity is set to 2097152 as this is 
+       the maximum number of events we allow as of now */
+    cuptiu_table_p = papi_malloc(sizeof(cuptiu_event_table_t));
+    cuptiu_table_p->capacity = 2097152;
+    cuptiu_table_p->count = 0;
+
+    htable_init(&cuptiu_table_p->htable);
+
+    cuptiu_table_p->events = papi_calloc(2097152, sizeof(cuptiu_event_t)); 
+}
+
 /** @class cuptip_init
   * @brief Load and initialize API's.  
 */
@@ -1392,7 +1409,7 @@ int cuptip_init(void)
     if (papi_errno != PAPI_OK) {
         goto fn_fail;
     }
-   
+
     /* if no gpu's are found exit */
     if (num_gpus <= 0) {
         cuptic_disabled_reason_set("No GPUs found on system.");
@@ -1411,15 +1428,19 @@ int cuptip_init(void)
     if (papi_errno != PAPI_OK) {
         goto fn_fail;
     }
+
     papi_errno = cuInitPtr(0);
     if (papi_errno != CUDA_SUCCESS) {
         cuptic_disabled_reason_set("Failed to initialize CUDA driver API.");
         goto fn_fail;
     }
 
+
+    /* initialize main hash table to store entries */
+    init_main_htable(); 
+
     /* initialize hash table with cuda native events */
     init_event_table();
-    cuptiu_table_p = &cuptiu_table;
 
     return PAPI_OK;
 fn_fail:
@@ -1436,6 +1457,7 @@ int cuptip_init(void)
   * @param **targeted_event_names
   *   Event table to hold subset of user added events.
 */
+/*
 int verify_events(uint64_t *events_id, int num_events, 
                   cuptiu_event_table_t **targeted_event_names) 
 {
@@ -1469,6 +1491,42 @@ int verify_events(uint64_t *events_id, int num_events,
   fn_exit:                                                                            
     return papi_errno;  
 }
+*/
+int verify_events(uint64_t *events_id, int num_events, 
+                  cuptip_control_t state) 
+{
+    int papi_errno = PAPI_OK, i;
+    char name[PAPI_MAX_STR_LEN] = { 0 }; 
+
+    for (i = 0; i < num_gpus; i++) { 
+        papi_errno = cuptiu_event_table_create_init_capacity(
+                         num_events * num_gpus,
+                         sizeof(cuptiu_event_t), &(state->gpu_ctl[i].event_names)
+                     ); 
+        if (papi_errno != PAPI_OK) {
+            goto fn_exit;
+        }
+     }  
+
+    for (i = 0; i < num_events; i++) {
+        event_info_t info;
+        papi_errno = evt_id_to_info(events_id[i], &info);
+        if (papi_errno != PAPI_OK) {
+            break;
+        }    
+        sprintf(name, "%s", cuptiu_table_p->events[info.nameid].name);
+        strcpy(state->gpu_ctl[info.device].event_names->added_cuda_evts[i], name);
+        state->gpu_ctl[info.device].event_names->added_cuda_dev[i] = info.device;
+        void *p;
+        if (htable_find(cuptiu_table_p->htable, name, (void **) &p) != HTABLE_SUCCESS) {
+            htable_insert(state->gpu_ctl[info.device].event_names->htable, name, (void **) &p );
+        }    
+        state->gpu_ctl[info.device].event_names->count++;
+    }    
+
+  fn_exit:     
+    return papi_errno;  
+}
 
 /** @class cuptip_ctx_create
   * @brief Create a profiling context for the requested Cuda events.
@@ -1489,10 +1547,10 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t
     char name[PAPI_2MAX_STR_LEN] = { 0 };
     cuptiu_event_table_t *targeted_event_names;
 
-    papi_errno = verify_events(events_id, num_events, &targeted_event_names);
-    if (papi_errno != PAPI_OK) {
-        return papi_errno;
-    }
+    //papi_errno = verify_events(events_id, num_events, &targeted_event_names);
+    //if (papi_errno != PAPI_OK) {
+    //    return papi_errno;
+    //}
 
     /* create a cuptip_control_t struct which contains read_count, running, cupti_info_t and cuptip_gpu_state_t */
     cuptip_control_t state = (cuptip_control_t) papi_calloc (1, sizeof(struct cuptip_control_s));
@@ -1502,7 +1560,7 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t
 
     /* allocate memory for the total number of gpus for the cuptip_gpu_state_t struct 
        with the device qualifier refactor we only want to count the total number of unique gpus */
-    state->gpu_ctl = (cuptip_gpu_state_t *) papi_calloc(num_unique_gpus, sizeof(cuptip_gpu_state_t));
+    state->gpu_ctl = (cuptip_gpu_state_t *) papi_calloc(num_gpus, sizeof(cuptip_gpu_state_t));
     if (state->gpu_ctl == NULL) {
         return PAPI_ENOMEM;
     }
@@ -1510,9 +1568,9 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t
     counters = papi_malloc(num_events * sizeof(*counters));
 
     /* for each unique gpu store the gpu id for that gpu index */
-    for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) {
+    for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         state->gpu_ctl[gpu_id].gpu_id = gpu_id;
-        state->gpu_ctl[gpu_id].event_names = targeted_event_names;
+        //state->gpu_ctl[gpu_id].event_names = targeted_event_names;
     }
 
     /* register the user created cuda context for the current gpu if not already known */
@@ -1527,6 +1585,11 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t
         goto fn_exit;
     }
 
+    papi_errno = verify_events(events_id, num_events, state);
+    if (papi_errno != PAPI_OK) {
+        goto fn_exit;
+    }
+
     /* multipass is not supporter; therefore, we must check the Cuda native event */
     papi_errno = check_multipass(state);
     if (papi_errno != PAPI_OK) {
@@ -1565,7 +1628,8 @@ int cuptip_ctx_start(cuptip_control_t state)
     }
 
     /* enumerate through all of the unique gpus */
-    for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) {
+    for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
+        printf("gpu_id is: %d\n", gpu_id);
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
         if (gpu_ctl->event_names->count == 0) {
             continue;
@@ -1593,12 +1657,14 @@ int cuptip_ctx_start(cuptip_control_t state)
         papi_errno += metric_get_counter_data_prefix_image(gpu_ctl);
         papi_errno += create_counter_data_image(gpu_ctl);
         if (papi_errno != PAPI_OK) {
+            printf("cupti profiler host configuration.\n");
             ERRDBG("Failed to create CUPTI profiler state for gpu %d\n", gpu_id);
             goto fn_fail;
         }
 
         papi_errno = begin_profiling(gpu_ctl);
         if (papi_errno != PAPI_OK) {
+            printf("begin profiling failed.\n");
             ERRDBG("Failed to start profiling for gpu %d\n", gpu_id);
             goto fn_fail;
         }
@@ -1638,7 +1704,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters)
         cudaArtCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc );
     }
 
-    for (gpu_id = 0; gpu_id < num_unique_gpus; gpu_id++) {
+    for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
         if (gpu_ctl->event_names->count == 0) {
             continue;
@@ -1784,7 +1850,7 @@ int cuptip_ctx_stop(cuptip_control_t state)
         cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc );
     }
 
-    for (gpu_id=0; gpu_id < num_unique_gpus; gpu_id++) {
+    for (gpu_id=0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
         if (gpu_ctl->event_names->count == 0) {
             continue;
@@ -1823,7 +1889,7 @@ int cuptip_ctx_destroy(cuptip_control_t *pstate)
     cuptip_control_t state = *pstate;
     int i, j;
     int papi_errno = nvpw_cuda_metricscontext_destroy(state);
-    for (i = 0; i < num_unique_gpus; i++) {
+    for (i = 0; i < num_gpus; i++) {
         reset_cupti_prof_config_images( &(state->gpu_ctl[i]) );
         cuptiu_event_table_destroy( &(state->gpu_ctl[i].event_names) );
         for (j = 0; j < state->gpu_ctl[i].rmr_count; j++) {
@@ -1931,49 +1997,64 @@ int evt_id_to_info(uint64_t event_id, event_info_t *info)
 */
 int init_event_table(void) 
 {
-    int gpu_idx, dev_id, i, listsubmetrics = 1, papi_errno = PAPI_OK;
-
-    for (gpu_idx = 0; gpu_idx < num_unique_gpus; gpu_idx++) {
-        NVPW_MetricsContext_GetMetricNames_Begin_Params getMetricNameBeginParams = {
-            .structSize = NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE,
-            .pPriv = NULL,
-            .pMetricsContext = avail_events[gpu_idx].pmetricsContextCreateParams->pMetricsContext,
-            .hidePeakSubMetrics = !listsubmetrics,
-            .hidePerCycleSubMetrics = !listsubmetrics,
-            .hidePctOfPeakSubMetrics = !listsubmetrics,
-        };
-        nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_BeginPtr(&getMetricNameBeginParams), goto fn_fail );
-
-        avail_events[gpu_idx].num_metrics = getMetricNameBeginParams.numMetrics;
-        cuptiu_table.events = papi_calloc(avail_events[gpu_idx].num_metrics, sizeof(cuptiu_event_t));
-        if (cuptiu_table.events == NULL) {
-            papi_errno = PAPI_ENOMEM;
-            goto fn_fail;
+    int dev_id, found, table_idx = 0; 
+    int gpu_idx, i, listsubmetrics = 1, papi_errno = PAPI_OK;
+
+    /* instatiate struct to collect the total metric count and metric names;
+       instantiated here to avoid scoping issues */
+    NVPW_MetricsContext_GetMetricNames_Begin_Params getMetricNameBeginParams = { NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE };
+
+    /* loop through all available devices on the current system */
+    for (dev_id = 0; dev_id < num_gpus; dev_id++) {
+        found = find_same_chipname(dev_id);
+        /* unique device found, collect metadata  */
+        if (found == -1) {
+            /* increment table index */
+            if (dev_id > 0)
+                table_idx++;
+
+            /* assigning values to member variables */
+            getMetricNameBeginParams.pPriv = NULL;
+            getMetricNameBeginParams.pMetricsContext = avail_events[table_idx].pmetricsContextCreateParams->pMetricsContext;
+            getMetricNameBeginParams.hidePeakSubMetrics = !listsubmetrics;
+            getMetricNameBeginParams.hidePerCycleSubMetrics = !listsubmetrics;
+            getMetricNameBeginParams.hidePctOfPeakSubMetrics = !listsubmetrics;
+
+            nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_BeginPtr(&getMetricNameBeginParams), goto fn_fail ); 
+
+            /* for each unique device found, store both the total number of metrics and metric names */
+            avail_events[table_idx].num_metrics = getMetricNameBeginParams.numMetrics;
+            avail_events[table_idx].metric_names = getMetricNameBeginParams.ppMetricNames; 
+
+            papi_errno = cuptiu_event_table_create_init_capacity(avail_events[table_idx].num_metrics * num_gpus, sizeof(cuptiu_event_t), &(avail_events[table_idx].cuptiu_table_p));
+            if (papi_errno != PAPI_OK) {
+                goto fn_exit;
+            }
+            avail_events[table_idx].cuptiu_table_p->events = papi_calloc(avail_events[table_idx].num_metrics, sizeof(cuptiu_event_t));
         }
-        
-        papi_errno = cuptiu_event_table_create_init_capacity(avail_events[gpu_idx].num_metrics * num_gpus, sizeof(cuptiu_event_t), &(avail_events[gpu_idx].nv_metrics));
-        if (papi_errno != PAPI_OK) {
-            goto fn_exit;
+        /* device metadata already collected, set table index */
+        else {
+            /* set table_idx to */
+            table_idx = found;
         }
-        for (dev_id = 0; dev_id < num_gpus; dev_id++) {
-            for (i = 0; i < avail_events[gpu_idx].num_metrics; i++) {
-                papi_errno = get_ntv_events( avail_events[gpu_idx].nv_metrics,
-                                             getMetricNameBeginParams.ppMetricNames[i],
-                                             i, 0, dev_id );
-                if (papi_errno != PAPI_OK) {
-                    goto fn_exit;
-                }
-            }
+     
+        /* loop through metrics to add to overall event table */
+        for (i = 0; i < avail_events[table_idx].num_metrics; i++) {
+            papi_errno = get_ntv_events( cuptiu_table_p, avail_events[table_idx].metric_names[i], i, 0, dev_id);
+            //papi_errno = get_ntv_events( avail_events[0].cuptiu_table_p, getMetricNameBeginParams.ppMetricNames[i], i, 0, dev_id);
+            if (papi_errno != PAPI_OK)
+                goto fn_exit;
         }
-        cuptiu_table.events = papi_realloc(cuptiu_table.events, avail_events[gpu_idx].nv_metrics->count * sizeof(cuptiu_event_t));
-        cuptiu_table.count = avail_events[gpu_idx].nv_metrics->count;
-        cuptiu_table.htable = avail_events[gpu_idx].nv_metrics->htable;
 
+    }
+
+    /* free memory */
+    for (i = 0; i < table_idx; i++) {
         NVPW_MetricsContext_GetMetricNames_End_Params getMetricNameEndParams = {
             .structSize = NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE,
             .pPriv = NULL,
             .pMetricsContext = avail_events[gpu_idx].pmetricsContextCreateParams->pMetricsContext,
-        };
+        };   
         nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_EndPtr((NVPW_MetricsContext_GetMetricNames_End_Params *) &getMetricNameEndParams), goto fn_fail );
     }
 
@@ -2002,8 +2083,9 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name,
     int papi_errno;
     char description[256];
     int *count = &evt_table->count;
-    cuptiu_event_t *events = cuptiu_table.events;
-    
+    //cuptiu_event_t *events = cuptiu_table.events;
+    cuptiu_event_t *events = evt_table->events;    
+
     /* check to see if evt_name argument has been provided */
     if (evt_name == NULL) {
         return PAPI_EINVAL;
diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c
index f3aeb9e08..f378418af 100644
--- a/src/components/cuda/papi_cupti_common.c
+++ b/src/components/cuda/papi_cupti_common.c
@@ -714,6 +714,7 @@ int cuptic_device_acquire(cuptiu_event_table_t *evt_table)
         return papi_errno;
     }
     if (bitmask & global_gpu_bitmask) {
+        printf("We fail inside here.\n");
         return PAPI_ECNFLCT;
     }
     _papi_hwi_lock(_cuda_lock);

From 155a81190ea7566c363121d5e141d4b482b43dbb Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Tue, 7 Jan 2025 03:15:18 +0000
Subject: [PATCH 02/16] Support multiple GPUs; tested on a system with an
 H100 and a V100.

---
 src/components/cuda/cupti_profiler.c | 48 ++++++++++++++++++----------
 1 file changed, 31 insertions(+), 17 deletions(-)

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
index b7509aa9a..c72973c4b 100644
--- a/src/components/cuda/cupti_profiler.c
+++ b/src/components/cuda/cupti_profiler.c
@@ -97,7 +97,6 @@ struct list_metrics_s {
 
 static void *dl_nvpw;
 static int num_gpus;
-//static int num_unique_gpus = 2;
 static list_metrics_t *avail_events;
 
 static cuptiu_event_table_t *cuptiu_table_p;
@@ -155,7 +154,7 @@ static int evt_code_to_name(uint64_t event_code, char *name, int len);
 static int evt_name_to_basename(const char *name, char *base, int len);
 static int evt_name_to_device(const char *name, int *device);
 static int retrieve_metric_descr( NVPA_MetricsContext *pMetricsContext, const char *evt_name,
-                                  char *description, int gpu_id );
+                                  char *description, const char *chip_name );
 static int retrieve_metric_rmr( NVPA_MetricsContext *pMetricsContext, const char *evt_name,
                                 int *numDep, NVPA_RawMetricRequest **pRMR );
 
@@ -164,6 +163,7 @@ static int get_event_collection_method(const char *evt_name);
 static int get_event_names_rmr(cuptip_gpu_state_t *gpu_ctl);
 static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl);
 static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts);
+static int num_unique_devs(int num_gpus);
 
 /* nvperf function pointers */
 NVPA_Status ( *NVPW_GetSupportedChipNamesPtr ) (NVPW_GetSupportedChipNames_Params* params);
@@ -1374,19 +1374,27 @@ static void free_all_enumerated_metrics(void)
     avail_events = NULL;
 }
 
+/** @class init_main_htable
+  * @brief Initialize the main htable that will 
+  *        store the metric info for all devices..  
+*/
 static void init_main_htable(void) 
 {
-    int htable_errno;
+    int htable_errno, i, val = 1, base = 2;
+
+    /* get the total number of possible metrics to allocate for,
+       as of now we allow only 2^NAMEID_WIDTH metrics */
+    for (i = 0; i < NAMEID_WIDTH; i++) {
+        val *= base;
+    }
 
-    /* capacity is set to 2097152 as this is 
-       the maximum number of events we allow as of now */
     cuptiu_table_p = papi_malloc(sizeof(cuptiu_event_table_t));
-    cuptiu_table_p->capacity = 2097152;
+    cuptiu_table_p->capacity = val;
     cuptiu_table_p->count = 0;
 
     htable_init(&cuptiu_table_p->htable);
 
-    cuptiu_table_p->events = papi_calloc(2097152, sizeof(cuptiu_event_t)); 
+    cuptiu_table_p->events = papi_calloc(val, sizeof(cuptiu_event_t)); 
 }
 
 /** @class cuptip_init
@@ -1415,7 +1423,7 @@ int cuptip_init(void)
         cuptic_disabled_reason_set("No GPUs found on system.");
         goto fn_fail;
     }
-    
+   
     /* initialize cupti profiler and perfworks api */
     papi_errno = initialize_cupti_profiler_api();
     papi_errno += initialize_perfworks_api();
@@ -1435,7 +1443,6 @@ int cuptip_init(void)
         goto fn_fail;
     }
 
-
     /* initialize main hash table to store entries */
     init_main_htable(); 
 
@@ -1590,7 +1597,7 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t
         goto fn_exit;
     }
 
-    /* multipass is not supporter; therefore, we must check the Cuda native event */
+    /* multipass is not supported; therefore, we must check the Cuda native event */
     papi_errno = check_multipass(state);
     if (papi_errno != PAPI_OK) {
         goto fn_exit;
@@ -1629,7 +1636,6 @@ int cuptip_ctx_start(cuptip_control_t state)
 
     /* enumerate through all of the unique gpus */
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
-        printf("gpu_id is: %d\n", gpu_id);
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
         if (gpu_ctl->event_names->count == 0) {
             continue;
@@ -2145,7 +2151,7 @@ static int shutdown_event_table(void)
   * @param gpu_id
   *   Device number, e.g. 0, 1, 2, ... ,etc.
 */
-static int retrieve_metric_descr( NVPA_MetricsContext *pMetricsContext, const char *evt_name, char *description, int gpu_id) 
+static int retrieve_metric_descr( NVPA_MetricsContext *pMetricsContext, const char *evt_name, char *description, const char *chip_name) 
 {
     COMPDBG("Entering.\n");
     int num_dep, i, len, passes, papi_errno;
@@ -2216,7 +2222,7 @@ static int retrieve_metric_descr( NVPA_MetricsContext *pMetricsContext, const ch
         .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE,
         .pPriv = NULL, // assign to NULL
         .activityKind = NVPA_ACTIVITY_KIND_PROFILER,
-        .pChipName = avail_events[gpu_id].chip_name,
+        .pChipName = chip_name,
     };
     nvpa_err = NVPW_CUDA_RawMetricsConfig_CreatePtr(&nvpw_metricsConfigCreateParams);
     if (nvpa_err != NVPA_STATUS_SUCCESS) {
@@ -2541,9 +2547,11 @@ static int evt_code_to_name(uint64_t event_code, char *name, int len)
 int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info)
 {
 
-    int papi_errno, len, gpu_id;
+    int papi_errno, len, i, gpu_id;
     event_info_t inf;
     char description[PAPI_HUGE_STR_LEN];
+
+    /* get event code info */
     papi_errno = evt_id_to_info(event_code, &inf);
     if (papi_errno != PAPI_OK) {
         return papi_errno;
@@ -2551,8 +2559,15 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info)
 
     /* collect the description and calculated numpass for a specific nameid */
     if (cuptiu_table_p->events[inf.nameid].desc[0] == 0) {
-        papi_errno = retrieve_metric_descr( avail_events[0].pmetricsContextCreateParams->pMetricsContext,
-                                            cuptiu_table_p->events[inf.nameid].name, cuptiu_table_p->events[inf.nameid].desc, 0 );
+        for (i = 0; i < num_gpus; ++i) {
+            if (cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) {
+                gpu_id = i;
+                break;
+            }
+        }
+        papi_errno = retrieve_metric_descr( avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext,
+                                            cuptiu_table_p->events[inf.nameid].name, cuptiu_table_p->events[inf.nameid].desc,
+                                            avail_events[gpu_id].pmetricsContextCreateParams->pChipName );
         if (papi_errno != PAPI_OK) {
             return papi_errno;
         }
@@ -2569,7 +2584,6 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info)
             break;
         case DEVICE_FLAG:
         {
-            int i;
             char devices[PAPI_MAX_STR_LEN] = { 0 };
             for (i = 0; i < num_gpus; ++i) {
                 if (cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) {

From 41832798e58cea271402c0c26ba685a9c58a3339 Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Tue, 7 Jan 2025 03:17:46 +0000
Subject: [PATCH 03/16] Revert change in papi_cupti_common.c

---
 src/components/cuda/papi_cupti_common.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c
index f378418af..f3aeb9e08 100644
--- a/src/components/cuda/papi_cupti_common.c
+++ b/src/components/cuda/papi_cupti_common.c
@@ -714,7 +714,6 @@ int cuptic_device_acquire(cuptiu_event_table_t *evt_table)
         return papi_errno;
     }
     if (bitmask & global_gpu_bitmask) {
-        printf("We fail inside here.\n");
         return PAPI_ECNFLCT;
     }
     _papi_hwi_lock(_cuda_lock);

From 2aabc4bbe375982064b3fb3b23124f9586871ae0 Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Tue, 7 Jan 2025 15:15:36 +0000
Subject: [PATCH 04/16] Code cleanup

---
 src/components/cuda/cupti_profiler.c | 75 ++++++++--------------------
 1 file changed, 21 insertions(+), 54 deletions(-)

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
index c72973c4b..a340ea499 100644
--- a/src/components/cuda/cupti_profiler.c
+++ b/src/components/cuda/cupti_profiler.c
@@ -825,6 +825,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .pChipName = avail_events[gpu_ctl->gpu_id].chip_name,
     };
     nvpwCheckErrors( NVPW_CUDA_RawMetricsConfig_CreatePtr(&nvpw_metricsConfigCreateParams), goto fn_fail );
+    printf("Made it past first nvpwCheckErrors.\n");
 
     if( gpu_ctl->counterAvailabilityImage.data != NULL) {
         NVPW_RawMetricsConfig_SetCounterAvailability_Params setCounterAvailabilityParams = {
@@ -834,6 +835,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
             .pCounterAvailabilityImage = gpu_ctl->counterAvailabilityImage.data,
         };
         nvpwCheckErrors( NVPW_RawMetricsConfig_SetCounterAvailabilityPtr(&setCounterAvailabilityParams), goto fn_fail );
+        printf("Made it pass second nvpwCheckErrors.\n");
     };
 
     /* NOTE: maxPassCount is being set to 1 as a final safety net to limit metric collection to a single pass.
@@ -847,6 +849,8 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .maxPassCount = 1,
     };
     nvpwCheckErrors( NVPW_RawMetricsConfig_BeginPassGroupPtr(&beginPassGroupParams), goto fn_fail );
+    printf("Made it pass third nvpwCheckErrors.\n");
+
 
     NVPW_RawMetricsConfig_AddMetrics_Params addMetricsParams = {
         .structSize = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE,
@@ -855,7 +859,13 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .pRawMetricRequests = gpu_ctl->rmr,
         .numMetricRequests = gpu_ctl->rmr_count,
     };
-    nvpwCheckErrors( NVPW_RawMetricsConfig_AddMetricsPtr(&addMetricsParams), goto fn_fail );
+    //nvpwCheckErrors( NVPW_RawMetricsConfig_AddMetricsPtr(&addMetricsParams), goto fn_fail );
+    NVPA_Status _status = NVPW_RawMetricsConfig_AddMetricsPtr(&addMetricsParams);
+    if (_status != NVPA_STATUS_SUCCESS) {
+        printf("Failed with status: %d\n", _status);
+    } 
+    printf("Made it pass fourth nvpwCheckErrors.\n");
+
 
     NVPW_RawMetricsConfig_EndPassGroup_Params endPassGroupParams = {
         .structSize = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE,
@@ -863,6 +873,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
     };
     nvpwCheckErrors( NVPW_RawMetricsConfig_EndPassGroupPtr(&endPassGroupParams), goto fn_fail );
+    printf("Made it past fifth nvpwCheckErrors.\n");
 
     NVPW_RawMetricsConfig_GenerateConfigImage_Params generateConfigImageParams = {
         .structSize = NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE,
@@ -870,6 +881,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
     };
     nvpwCheckErrors( NVPW_RawMetricsConfig_GenerateConfigImagePtr(&generateConfigImageParams), goto fn_fail );
+    printf("Made it past sixth nvpwCheckErrors.\n");
 
     NVPW_RawMetricsConfig_GetConfigImage_Params getConfigImageParams = {
         .structSize = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE,
@@ -879,6 +891,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .pBuffer = NULL,
     };
     nvpwCheckErrors( NVPW_RawMetricsConfig_GetConfigImagePtr(&getConfigImageParams), goto fn_fail );
+    printf("Made it past seventh nvpwCheckErrors.\n");
 
     gpu_ctl->configImage.size = getConfigImageParams.bytesCopied;
     gpu_ctl->configImage.data = (uint8_t *) papi_calloc(gpu_ctl->configImage.size, sizeof(uint8_t));
@@ -890,6 +903,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
     getConfigImageParams.bytesAllocated = gpu_ctl->configImage.size;
     getConfigImageParams.pBuffer = gpu_ctl->configImage.data;
     nvpwCheckErrors( NVPW_RawMetricsConfig_GetConfigImagePtr(&getConfigImageParams), goto fn_fail );
+    printf("Made it past eigth nvpwCheckErrors.\n");
 
     NVPW_RawMetricsConfig_Destroy_Params rawMetricsConfigDestroyParams = {
         .structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE,
@@ -897,6 +911,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
     };
     nvpwCheckErrors( NVPW_RawMetricsConfig_DestroyPtr((NVPW_RawMetricsConfig_Destroy_Params *) &rawMetricsConfigDestroyParams), goto fn_fail );
+    printf("Made it past ninth nvpwCheckErrors.\n");
 
     return PAPI_OK;
 fn_fail:
@@ -1454,51 +1469,6 @@ int cuptip_init(void)
     return PAPI_EMISC;
 }
 
-/** @class verify_events
-  * @brief Verify user added events and create a subset table to be used for 
-  *        start, stop, etc.
-  * @param *events_id
-  *   Cuda native event id's.
-  * @param num_events
-  *   Number of Cuda native events a user is wanting to count.
-  * @param **targeted_event_names
-  *   Event table to hold subset of user added events.
-*/
-/*
-int verify_events(uint64_t *events_id, int num_events, 
-                  cuptiu_event_table_t **targeted_event_names) 
-{
-    int papi_errno = PAPI_OK, i;
-    char name[PAPI_MAX_STR_LEN] = { 0 };
-     
-    papi_errno = cuptiu_event_table_create_init_capacity(
-                     num_events * num_gpus,
-                     sizeof(cuptiu_event_t), targeted_event_names
-                 );
-    if (papi_errno != PAPI_OK) {
-        goto fn_exit;
-    }
-
-    for (i = 0; i < num_events; i++) {
-        event_info_t info;
-        papi_errno = evt_id_to_info(events_id[i], &info);
-        if (papi_errno != PAPI_OK) {
-            break;
-        }
-        sprintf(name, "%s", cuptiu_table_p->events[info.nameid].name);
-        strcpy((*targeted_event_names)->added_cuda_evts[i], name);
-        (*targeted_event_names)->added_cuda_dev[i] = info.device;
-        void *p;
-        if (htable_find(cuptiu_table_p->htable, name, (void **) &p) != HTABLE_SUCCESS) {
-            htable_insert((*targeted_event_names)->htable, name, (void **) &p );
-        }
-        (*targeted_event_names)->count++;
-    }
-
-  fn_exit:                                                                            
-    return papi_errno;  
-}
-*/
 int verify_events(uint64_t *events_id, int num_events, 
                   cuptip_control_t state) 
 {
@@ -1554,11 +1524,6 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t
     char name[PAPI_2MAX_STR_LEN] = { 0 };
     cuptiu_event_table_t *targeted_event_names;
 
-    //papi_errno = verify_events(events_id, num_events, &targeted_event_names);
-    //if (papi_errno != PAPI_OK) {
-    //    return papi_errno;
-    //}
-
     /* create a cuptip_control_t struct which contains read_count, running, cupti_info_t and cuptip_gpu_state_t */
     cuptip_control_t state = (cuptip_control_t) papi_calloc (1, sizeof(struct cuptip_control_s));
     if (state == NULL) {
@@ -1660,8 +1625,11 @@ int cuptip_ctx_start(cuptip_control_t state)
 
         /* CUPTI profiler host configuration */
         papi_errno = metric_get_config_image(gpu_ctl);
+        printf("papi_errno first: %d\n", papi_errno);
         papi_errno += metric_get_counter_data_prefix_image(gpu_ctl);
+        printf("papi_errno second: %d\n", papi_errno);
         papi_errno += create_counter_data_image(gpu_ctl);
+         printf("papi_errno third: %d\n", papi_errno);
         if (papi_errno != PAPI_OK) {
             printf("cupti profiler host configuration.\n");
             ERRDBG("Failed to create CUPTI profiler state for gpu %d\n", gpu_id);
@@ -2047,7 +2015,6 @@ int init_event_table(void)
         /* loop through metrics to add to overall event table */
         for (i = 0; i < avail_events[table_idx].num_metrics; i++) {
             papi_errno = get_ntv_events( cuptiu_table_p, avail_events[table_idx].metric_names[i], i, 0, dev_id);
-            //papi_errno = get_ntv_events( avail_events[0].cuptiu_table_p, getMetricNameBeginParams.ppMetricNames[i], i, 0, dev_id);
             if (papi_errno != PAPI_OK)
                 goto fn_exit;
         }
@@ -2060,7 +2027,7 @@ int init_event_table(void)
             .structSize = NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE,
             .pPriv = NULL,
             .pMetricsContext = avail_events[gpu_idx].pmetricsContextCreateParams->pMetricsContext,
-        };   
+        };
         nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_EndPtr((NVPW_MetricsContext_GetMetricNames_End_Params *) &getMetricNameEndParams), goto fn_fail );
     }
 
@@ -2089,7 +2056,6 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name,
     int papi_errno;
     char description[256];
     int *count = &evt_table->count;
-    //cuptiu_event_t *events = cuptiu_table.events;
     cuptiu_event_t *events = evt_table->events;    
 
     /* check to see if evt_name argument has been provided */
@@ -2559,6 +2525,7 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info)
 
     /* collect the description and calculated numpass for a specific nameid */
     if (cuptiu_table_p->events[inf.nameid].desc[0] == 0) {
+        /* find a matching device id to get correct MetricsContext and chip name */
         for (i = 0; i < num_gpus; ++i) {
             if (cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) {
                 gpu_id = i;

From 40516226d4956cb9d46b049e883a7249dddb2385 Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Fri, 17 Jan 2025 14:46:49 +0000
Subject: [PATCH 05/16] Updates to support multi-gpu for Cuda component.

---
 src/components/cuda/cupti_profiler.c | 194 +++++++++++++--------------
 src/components/cuda/cupti_utils.h    |  13 ++
 2 files changed, 104 insertions(+), 103 deletions(-)

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
index a340ea499..d29bb3340 100644
--- a/src/components/cuda/cupti_profiler.c
+++ b/src/components/cuda/cupti_profiler.c
@@ -47,9 +47,7 @@
 
 typedef struct byte_array_s         byte_array_t;
 typedef struct cuptip_gpu_state_s   cuptip_gpu_state_t;
-typedef struct list_metrics_s       list_metrics_t;
 typedef struct NVPA_MetricsContext  NVPA_MetricsContext;
-typedef NVPW_CUDA_MetricsContext_Create_Params MCCP_t;
 
 typedef struct {
     int device;
@@ -65,7 +63,7 @@ struct byte_array_s {
 
 struct cuptip_gpu_state_s {
     int                    gpu_id;
-    cuptiu_event_table_t  *event_names;
+    cuptiu_event_table_t  *targeted_events;
     int                    rmr_count;
     NVPA_RawMetricRequest *rmr;
     MCCP_t                *pmetricsContextCreateParams;
@@ -87,17 +85,9 @@ struct cuptip_control_s {
     cuptic_info_t       info;
 };
 
-struct list_metrics_s {
-    char chip_name[32];
-    MCCP_t *pmetricsContextCreateParams;
-    int num_metrics;
-    const char* const* metric_names;
-    cuptiu_event_table_t *cuptiu_table_p;
-};
-
 static void *dl_nvpw;
 static int num_gpus;
-static list_metrics_t *avail_events;
+static gpu_record_t *avail_gpu_info;
 
 static cuptiu_event_table_t *cuptiu_table_p;
 
@@ -116,8 +106,8 @@ static int initialize_perfworks_api(void);
 
 /* utility functions to init metrics and cuda native event table */
 static int init_all_metrics(void);
-static int init_event_table(void);
 static void init_main_htable(void);
+static int init_event_table(void);
 static int shutdown_event_table(void);
 static void free_all_enumerated_metrics(void);
 
@@ -146,7 +136,6 @@ static int calculate_num_passes(struct NVPA_RawMetricsConfig *pRawMetricsConfig,
 
 /* functions to set and get cuda native event info  or convert cuda native events  */
 static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, unsigned int evt_code, int evt_pos, int gpu_id);
-//static int verify_events(uint64_t *events_id, int num_events, cuptiu_event_table_t **targeted_event_names);
 static int verify_events(uint64_t *events_id, int num_events, cuptip_control_t state);
 static int evt_id_to_info(uint64_t event_id, event_info_t *info);
 static int evt_id_create(event_info_t *info, uint64_t *event_id);
@@ -160,7 +149,7 @@ static int retrieve_metric_rmr( NVPA_MetricsContext *pMetricsContext, const char
 
 /* misc */
 static int get_event_collection_method(const char *evt_name);
-static int get_event_names_rmr(cuptip_gpu_state_t *gpu_ctl);
+static int get_targeted_events_rmr(cuptip_gpu_state_t *gpu_ctl);
 static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl);
 static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts);
 static int num_unique_devs(int num_gpus);
@@ -470,7 +459,6 @@ static int initialize_perfworks_api(void)
     return PAPI_OK;
 }
 
-
 static int get_chip_name(int dev_num, char* chipName)
 {
     int papi_errno;
@@ -489,7 +477,7 @@ static int get_chip_name(int dev_num, char* chipName)
     return PAPI_OK;
 }
 
-/** @class get_event_names_rmr
+/** @class get_targeted_events_rmr
   * @brief For a Cuda native event name collect raw metrics and count
   *        of raw metrics for collection. Raw Metrics are one layer of the Metric API
   *        and contains the list of raw counters and generates configuration file
@@ -499,7 +487,7 @@ static int get_chip_name(int dev_num, char* chipName)
   *   Structure of type cuptip_gpu_state_t which has member variables such as 
   *   gpu_id, rmr, rmr_count, and more.
 */
-static int get_event_names_rmr(cuptip_gpu_state_t *gpu_ctl)
+static int get_targeted_events_rmr(cuptip_gpu_state_t *gpu_ctl)
 {
     COMPDBG("Entering.\n");
     int gpu_id, num_dep, count_raw_metrics = 0, papi_errno = PAPI_OK;
@@ -508,11 +496,11 @@ static int get_event_names_rmr(cuptip_gpu_state_t *gpu_ctl)
     cuptiu_event_t *evt_rec;
 
     /* for each event in the event table collect the raw metric requests */
-    for (i = 0; i < gpu_ctl->event_names->count; i++) {
+    for (i = 0; i < gpu_ctl->targeted_events->count; i++) {
         /* Not using the correct global event names now.*/
         papi_errno = retrieve_metric_rmr(
                          gpu_ctl->pmetricsContextCreateParams->pMetricsContext,
-                         gpu_ctl->event_names->added_cuda_evts[i], &num_dep, 
+                         gpu_ctl->targeted_events->added_cuda_evts[i], &num_dep, 
                          &collect_rmr
                      );
         /* why is PAPI_ENOEVNT hard coded? */
@@ -651,7 +639,7 @@ static int nvpw_cuda_metricscontext_create(cuptip_control_t state)
         
         /* setting metadata values */
         pMCCP->structSize = NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE;
-        pMCCP->pChipName = avail_events[gpu_id].chip_name;
+        pMCCP->pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name;
 
         /* create context */
         nvpa_err = NVPW_CUDA_MetricsContext_CreatePtr(pMCCP);
@@ -718,11 +706,11 @@ static int check_multipass(cuptip_control_t state)
 
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
-        if (gpu_ctl->event_names->count == 0) {
+        if (gpu_ctl->targeted_events->count == 0) {
             continue;
         }
 
-        papi_errno = get_event_names_rmr(gpu_ctl);
+        papi_errno = get_targeted_events_rmr(gpu_ctl);
         if (papi_errno != PAPI_OK) {
             goto fn_exit;
         }
@@ -732,7 +720,7 @@ static int check_multipass(cuptip_control_t state)
             .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE,
             .pPriv = NULL,
             .activityKind = NVPA_ACTIVITY_KIND_PROFILER,
-            .pChipName = avail_events[gpu_id].chip_name,
+            .pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name,
         };
         nvpa_err = NVPW_CUDA_RawMetricsConfig_CreatePtr(
                        &nvpw_metricsConfigCreateParams
@@ -809,7 +797,7 @@ static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl)
 
 /** @class metric_get_config_image
   * @brief Retrieves binary ConfigImage for the Cuda native event metrics listed 
-  *        for collection. The function get_event_names_rmr( ... ) must be 
+  *        for collection. The function get_targeted_events_rmr( ... ) must be 
   *        called before this step is possible. 
   * @param *gpu_ctl
   *   Structure of type cuptip_gpu_state_t which has member variables such as 
@@ -822,7 +810,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE,
         .pPriv = NULL,
         .activityKind = NVPA_ACTIVITY_KIND_PROFILER,
-        .pChipName = avail_events[gpu_ctl->gpu_id].chip_name,
+        .pChipName = cuptiu_table_p->avail_gpu_info[gpu_ctl->gpu_id].chip_name,
     };
     nvpwCheckErrors( NVPW_CUDA_RawMetricsConfig_CreatePtr(&nvpw_metricsConfigCreateParams), goto fn_fail );
     printf("Made it past first nvpwCheckErrors.\n");
@@ -920,7 +908,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
 
 /** @class metric_get_counter_data_prefix_image
   * @brief Retrieves binary CounterDataPrefix for the Cuda native event metrics 
-  *        listed for collection. The function get_event_names_rmr( ... ) 
+  *        listed for collection. The function get_targeted_events_rmr( ... ) 
   *        must be called before this step is possible. 
   * @param *gpu_ctl
   *   Structure of type cuptip_gpu_state_t which has member variables such as 
@@ -932,7 +920,7 @@ static int metric_get_counter_data_prefix_image(cuptip_gpu_state_t *gpu_ctl)
     NVPW_CounterDataBuilder_Create_Params counterDataBuilderCreateParams = {
         .structSize = NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE,
         .pPriv = NULL,
-        .pChipName = avail_events[gpu_ctl->gpu_id].chip_name,
+        .pChipName = cuptiu_table_p->avail_gpu_info[gpu_ctl->gpu_id].chip_name,
     };
     nvpwCheckErrors( NVPW_CounterDataBuilder_CreatePtr(&counterDataBuilderCreateParams), goto fn_fail );
 
@@ -1224,7 +1212,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts)
 {
     COMPDBG("eval_metric_values. dev = %d\n", gpu_ctl->gpu_id);
     int i, papi_errno = PAPI_OK;
-    int numMetrics = gpu_ctl->event_names->count;
+    int numMetrics = gpu_ctl->targeted_events->count;
     double *gpuValues;
     char **metricNames;
 
@@ -1248,7 +1236,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts)
     }    
 
     for (i = 0; i < numMetrics; i++) {
-        metricNames[i] = gpu_ctl->event_names->added_cuda_evts[i];
+        metricNames[i] = gpu_ctl->targeted_events->added_cuda_evts[i];
         LOGDBG("Setting metric name %s\n", metricNames[i]);
     }
 
@@ -1275,7 +1263,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts)
     nvpwCheckErrors( NVPW_MetricsContext_EvaluateToGpuValuesPtr(&evalToGpuParams), goto fn_fail );
 
     /* store the gpu values */
-    for (i = 0; i < (int) gpu_ctl->event_names->count; i++) {
+    for (i = 0; i < (int) gpu_ctl->targeted_events->count; i++) {
         counts[i] = gpuValues[i];
     }
 
@@ -1299,14 +1287,13 @@ static int find_same_chipname(int gpu_id)
 {
     int i;
     for (i = 0; i < gpu_id; i++) {
-        if (!strcmp(avail_events[gpu_id].chip_name, avail_events[i].chip_name)) {
+        if (!strcmp(cuptiu_table_p->avail_gpu_info[gpu_id].chip_name, cuptiu_table_p->avail_gpu_info[i].chip_name)) {
             return i;
         }
     }
     return -1;
 }
 
-
 /** @class init_all_metrics
   * @brief Initialize metrics for a specific GPU.
   *        
@@ -1314,13 +1301,13 @@ static int find_same_chipname(int gpu_id)
 static int init_all_metrics(void)
 {
     int gpu_id, papi_errno = PAPI_OK;
-    avail_events = (list_metrics_t *) papi_calloc(num_gpus, sizeof(list_metrics_t));
-    if (avail_events == NULL) {
+    cuptiu_table_p->avail_gpu_info = (gpu_record_t *) papi_calloc(num_gpus, sizeof(gpu_record_t));
+    if (cuptiu_table_p->avail_gpu_info == NULL) {
         papi_errno = PAPI_ENOMEM;
         goto fn_exit;
     }
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
-        papi_errno = get_chip_name(gpu_id, avail_events[gpu_id].chip_name);
+        papi_errno = get_chip_name(gpu_id, cuptiu_table_p->avail_gpu_info[gpu_id].chip_name);
         if (papi_errno != PAPI_OK) {
             goto fn_exit;
         }
@@ -1329,7 +1316,7 @@ static int init_all_metrics(void)
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         found = find_same_chipname(gpu_id);
         if (found > -1) {
-            avail_events[gpu_id].pmetricsContextCreateParams = avail_events[found].pmetricsContextCreateParams;
+            cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams = cuptiu_table_p->avail_gpu_info[found].pmetricsContextCreateParams;
             continue;
         }
         MCCP_t *pMCCP = (MCCP_t *) papi_calloc(1, sizeof(MCCP_t));
@@ -1338,10 +1325,10 @@ static int init_all_metrics(void)
             goto fn_exit;
         }
         pMCCP->structSize = NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE;
-        pMCCP->pChipName = avail_events[gpu_id].chip_name;
+        pMCCP->pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name;
         nvpwCheckErrors( NVPW_CUDA_MetricsContext_CreatePtr(pMCCP), goto fn_fail );
 
-        avail_events[gpu_id].pmetricsContextCreateParams = pMCCP;
+        cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams = pMCCP;
     }
 
 fn_exit:
@@ -1359,57 +1346,52 @@ static void free_all_enumerated_metrics(void)
     COMPDBG("Entering.\n");
     int gpu_id, found;
     NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams;
-    if (avail_events == NULL) {
+    if (cuptiu_table_p->avail_gpu_info == NULL) {
         return;
     }
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         found = find_same_chipname(gpu_id);
         if (found > -1) {
-            avail_events[gpu_id].num_metrics = 0;
-            avail_events[gpu_id].cuptiu_table_p = NULL;
-            avail_events[gpu_id].pmetricsContextCreateParams = NULL;
+            cuptiu_table_p->avail_gpu_info[gpu_id].num_metrics = 0;
+            cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams = NULL;
             continue;
         }
-        if (avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext) {
+        if (cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams->pMetricsContext) {
             metricsContextDestroyParams = (NVPW_MetricsContext_Destroy_Params) {
                 .structSize = NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE,
                 .pPriv = NULL,
-                .pMetricsContext = avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext,
+                .pMetricsContext = cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams->pMetricsContext,
             };
             nvpwCheckErrors(NVPW_MetricsContext_DestroyPtr(&metricsContextDestroyParams), );
         }
-        papi_free(avail_events[gpu_id].pmetricsContextCreateParams);
-        avail_events[gpu_id].pmetricsContextCreateParams = NULL;
+        papi_free(cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams);
+        cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams = NULL;
 
-        if (avail_events[gpu_id].cuptiu_table_p) {
-            cuptiu_event_table_destroy( &(avail_events[gpu_id].cuptiu_table_p) );
-        }
     }
-    papi_free(avail_events);
-    avail_events = NULL;
+    papi_free(cuptiu_table_p->avail_gpu_info);
+    cuptiu_table_p->avail_gpu_info = NULL;
 }
 
 /** @class init_main_htable
-  * @brief Initialize the main htable that will 
-  *        store the metric info for all devices..  
+ *  @brief Initialize the main htable used to collect metrics.
 */
 static void init_main_htable(void) 
 {
-    int htable_errno, i, val = 1, base = 2;
+    int i, val = 1, base = 2;
 
-    /* get the total number of possible metrics to allocate for,
-       as of now we allow only 2^NAMEID_WIDTH metrics */
+    /* allocate 2^NAMEID_WIDTH metric entries; this matches the number of nameid bits in the event encoding format */
     for (i = 0; i < NAMEID_WIDTH; i++) {
         val *= base;
-    }
-
+    }    
+   
+    /* allocate memory */ 
     cuptiu_table_p = papi_malloc(sizeof(cuptiu_event_table_t));
-    cuptiu_table_p->capacity = val;
-    cuptiu_table_p->count = 0;
-
-    htable_init(&cuptiu_table_p->htable);
-
+    cuptiu_table_p->capacity = val; 
+    cuptiu_table_p->count = 0; 
     cuptiu_table_p->events = papi_calloc(val, sizeof(cuptiu_event_t)); 
+   
+    /* initialize the main hash table for metric collection */ 
+    htable_init(&cuptiu_table_p->htable);
 }
 
 /** @class cuptip_init
@@ -1447,6 +1429,9 @@ int cuptip_init(void)
         goto fn_fail;
     }
 
+    /* init htable and allocate memory */
+    init_main_htable();    
+
     papi_errno = init_all_metrics();
     if (papi_errno != PAPI_OK) {
         goto fn_fail;
@@ -1458,9 +1443,6 @@ int cuptip_init(void)
         goto fn_fail;
     }
 
-    /* initialize main hash table to store entries */
-    init_main_htable(); 
-
     /* initialize hash table with cuda native events */
     init_event_table();
 
@@ -1478,7 +1460,7 @@ int verify_events(uint64_t *events_id, int num_events,
     for (i = 0; i < num_gpus; i++) { 
         papi_errno = cuptiu_event_table_create_init_capacity(
                          num_events * num_gpus,
-                         sizeof(cuptiu_event_t), &(state->gpu_ctl[i].event_names)
+                         sizeof(cuptiu_event_t), &(state->gpu_ctl[i].targeted_events)
                      ); 
         if (papi_errno != PAPI_OK) {
             goto fn_exit;
@@ -1492,13 +1474,13 @@ int verify_events(uint64_t *events_id, int num_events,
             break;
         }    
         sprintf(name, "%s", cuptiu_table_p->events[info.nameid].name);
-        strcpy(state->gpu_ctl[info.device].event_names->added_cuda_evts[i], name);
-        state->gpu_ctl[info.device].event_names->added_cuda_dev[i] = info.device;
+        strcpy(state->gpu_ctl[info.device].targeted_events->added_cuda_evts[i], name);
+        state->gpu_ctl[info.device].targeted_events->added_cuda_dev[i] = info.device;
         void *p;
         if (htable_find(cuptiu_table_p->htable, name, (void **) &p) != HTABLE_SUCCESS) {
-            htable_insert(state->gpu_ctl[info.device].event_names->htable, name, (void **) &p );
+            htable_insert(state->gpu_ctl[info.device].targeted_events->htable, name, (void **) &p );
         }    
-        state->gpu_ctl[info.device].event_names->count++;
+        state->gpu_ctl[info.device].targeted_events->count++;
     }    
 
   fn_exit:     
@@ -1522,7 +1504,7 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t
     int papi_errno = PAPI_OK, gpu_id, i;
     long long *counters = NULL;
     char name[PAPI_2MAX_STR_LEN] = { 0 };
-    cuptiu_event_table_t *targeted_event_names;
+    cuptiu_event_table_t *targeted_targeted_events;
 
     /* create a cuptip_control_t struct which contains read_count, running, cupti_info_t and cuptip_gpu_state_t */
     cuptip_control_t state = (cuptip_control_t) papi_calloc (1, sizeof(struct cuptip_control_s));
@@ -1542,7 +1524,7 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t
     /* for each unique gpu store the gpu id for that gpu index */
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         state->gpu_ctl[gpu_id].gpu_id = gpu_id;
-        //state->gpu_ctl[gpu_id].event_names = targeted_event_names;
+        //state->gpu_ctl[gpu_id].targeted_events = targeted_targeted_events;
     }
 
     /* register the user created cuda context for the current gpu if not already known */
@@ -1602,11 +1584,11 @@ int cuptip_ctx_start(cuptip_control_t state)
     /* enumerate through all of the unique gpus */
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
-        if (gpu_ctl->event_names->count == 0) {
+        if (gpu_ctl->targeted_events->count == 0) {
             continue;
         }
-        LOGDBG("Device num %d: event_count %d, rmr count %d\n", gpu_id, gpu_ctl->event_names->count, gpu_ctl->rmr_count);
-        papi_errno = cuptic_device_acquire(state->gpu_ctl[gpu_id].event_names);
+        LOGDBG("Device num %d: event_count %d, rmr count %d\n", gpu_id, gpu_ctl->targeted_events->count, gpu_ctl->rmr_count);
+        papi_errno = cuptic_device_acquire(state->gpu_ctl[gpu_id].targeted_events);
         if (papi_errno != PAPI_OK) {
             ERRDBG("Profiling same gpu from multiple event sets not allowed.\n");
             return papi_errno;
@@ -1680,7 +1662,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters)
 
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
-        if (gpu_ctl->event_names->count == 0) {
+        if (gpu_ctl->targeted_events->count == 0) {
             continue;
         }
 
@@ -1718,13 +1700,13 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters)
         if (papi_errno != PAPI_OK) {
             goto fn_exit;
         }
-        for (i = 0; i < (int) gpu_ctl->event_names->count; i++) {
+        for (i = 0; i < (int) gpu_ctl->targeted_events->count; i++) {
             if (state->read_count == 0) {
                 counter_vals[i] = counts[i];
             }
             else {
                 /* determine collection method such as max, min, sum, and avg for an added Cuda native event */
-                method = get_event_collection_method(gpu_ctl->event_names->added_cuda_evts[i]);
+                method = get_event_collection_method(gpu_ctl->targeted_events->added_cuda_evts[i]);
                 switch (method) {
                     case CUDA_SUM:
                         counter_vals[i] += counts[i];
@@ -1826,7 +1808,7 @@ int cuptip_ctx_stop(cuptip_control_t state)
 
     for (gpu_id=0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
-        if (gpu_ctl->event_names->count == 0) {
+        if (gpu_ctl->targeted_events->count == 0) {
             continue;
         }
         papi_errno = cuptic_ctxarr_get_ctx(state->info, gpu_id, &ctx);
@@ -1835,7 +1817,7 @@ int cuptip_ctx_stop(cuptip_control_t state)
         if (papi_errno != PAPI_OK) {
             goto fn_fail;
         }
-        papi_errno = cuptic_device_release(state->gpu_ctl[gpu_id].event_names);
+        papi_errno = cuptic_device_release(state->gpu_ctl[gpu_id].targeted_events);
         if (papi_errno != PAPI_OK) {
             goto fn_fail;
         }
@@ -1865,7 +1847,7 @@ int cuptip_ctx_destroy(cuptip_control_t *pstate)
     int papi_errno = nvpw_cuda_metricscontext_destroy(state);
     for (i = 0; i < num_gpus; i++) {
         reset_cupti_prof_config_images( &(state->gpu_ctl[i]) );
-        cuptiu_event_table_destroy( &(state->gpu_ctl[i].event_names) );
+        cuptiu_event_table_destroy( &(state->gpu_ctl[i].targeted_events) );
         for (j = 0; j < state->gpu_ctl[i].rmr_count; j++) {
             papi_free((void *) state->gpu_ctl[i].rmr[j].pMetricName);
         }
@@ -1955,9 +1937,11 @@ int evt_id_to_info(uint64_t event_id, event_info_t *info)
         return PAPI_ENOEVNT;
     }
 
+    /* 
     if (cuptiu_dev_check(cuptiu_table_p->events[info->nameid].device_map, info->device) == 0) {
         return PAPI_ENOEVNT;
-    }
+   }
+   */
 
     if (info->nameid >= cuptiu_table_p->count) {
         return PAPI_ENOEVNT;
@@ -1977,7 +1961,7 @@ int init_event_table(void)
     /* instatiate struct to collect the total metric count and metric names;
        instantiated here to avoid scoping issues */
     NVPW_MetricsContext_GetMetricNames_Begin_Params getMetricNameBeginParams = { NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE };
-
+    
     /* loop through all available devices on the current system */
     for (dev_id = 0; dev_id < num_gpus; dev_id++) {
         found = find_same_chipname(dev_id);
@@ -1989,7 +1973,7 @@ int init_event_table(void)
 
             /* assigning values to member variables */
             getMetricNameBeginParams.pPriv = NULL;
-            getMetricNameBeginParams.pMetricsContext = avail_events[table_idx].pmetricsContextCreateParams->pMetricsContext;
+            getMetricNameBeginParams.pMetricsContext = cuptiu_table_p->avail_gpu_info[table_idx].pmetricsContextCreateParams->pMetricsContext;
             getMetricNameBeginParams.hidePeakSubMetrics = !listsubmetrics;
             getMetricNameBeginParams.hidePerCycleSubMetrics = !listsubmetrics;
             getMetricNameBeginParams.hidePctOfPeakSubMetrics = !listsubmetrics;
@@ -1997,24 +1981,18 @@ int init_event_table(void)
             nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_BeginPtr(&getMetricNameBeginParams), goto fn_fail ); 
 
             /* for each unique device found, store both the total number of metrics and metric names */
-            avail_events[table_idx].num_metrics = getMetricNameBeginParams.numMetrics;
-            avail_events[table_idx].metric_names = getMetricNameBeginParams.ppMetricNames; 
-
-            papi_errno = cuptiu_event_table_create_init_capacity(avail_events[table_idx].num_metrics * num_gpus, sizeof(cuptiu_event_t), &(avail_events[table_idx].cuptiu_table_p));
-            if (papi_errno != PAPI_OK) {
-                goto fn_exit;
-            }
-            avail_events[table_idx].cuptiu_table_p->events = papi_calloc(avail_events[table_idx].num_metrics, sizeof(cuptiu_event_t));
+            cuptiu_table_p->avail_gpu_info[table_idx].num_metrics = getMetricNameBeginParams.numMetrics;
+            cuptiu_table_p->avail_gpu_info[table_idx].metric_names = getMetricNameBeginParams.ppMetricNames;
         }
         /* device metadata already collected, set table index */
         else {
             /* set table_idx to */
             table_idx = found;
         }
-     
+
         /* loop through metrics to add to overall event table */
-        for (i = 0; i < avail_events[table_idx].num_metrics; i++) {
-            papi_errno = get_ntv_events( cuptiu_table_p, avail_events[table_idx].metric_names[i], i, 0, dev_id);
+        for (i = 0; i < cuptiu_table_p->avail_gpu_info[table_idx].num_metrics; i++) {
+            papi_errno = get_ntv_events( cuptiu_table_p, cuptiu_table_p->avail_gpu_info[table_idx].metric_names[i], i, 0, dev_id);
             if (papi_errno != PAPI_OK)
                 goto fn_exit;
         }
@@ -2026,7 +2004,7 @@ int init_event_table(void)
         NVPW_MetricsContext_GetMetricNames_End_Params getMetricNameEndParams = {
             .structSize = NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE,
             .pPriv = NULL,
-            .pMetricsContext = avail_events[gpu_idx].pmetricsContextCreateParams->pMetricsContext,
+            .pMetricsContext = cuptiu_table_p->avail_gpu_info[gpu_idx].pmetricsContextCreateParams->pMetricsContext,
         };
         nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_EndPtr((NVPW_MetricsContext_GetMetricNames_End_Params *) &getMetricNameEndParams), goto fn_fail );
     }
@@ -2084,7 +2062,7 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name,
             return PAPI_ESYS;
         }
     }
-
+    
     cuptiu_dev_set(&event->device_map, gpu_id);
 
     return PAPI_OK;
@@ -2333,6 +2311,7 @@ int cuptip_evt_enum(uint64_t *event_code, int modifier)
         case PAPI_ENUM_EVENTS:
             papi_errno = evt_id_to_info(*event_code, &info);
             if (papi_errno != PAPI_OK) {
+                printf("We fail enum_events evt_id_to_info.\n");
                 break;
             }
             if (cuptiu_table_p->count > info.nameid + 1) {
@@ -2340,6 +2319,9 @@ int cuptip_evt_enum(uint64_t *event_code, int modifier)
                 info.flags = 0;
                 info.nameid++;
                 papi_errno = evt_id_create(&info, event_code);
+                if (papi_errno != PAPI_OK) {
+                    printf("Failed to create id.\n");
+                }
                 break;
             }
             papi_errno = PAPI_END;
@@ -2524,7 +2506,7 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info)
     }
 
     /* collect the description and calculated numpass for a specific nameid */
-    if (cuptiu_table_p->events[inf.nameid].desc[0] == 0) {
+    if (cuptiu_table_p->events[inf.nameid].desc[0] == '\0') {
         /* find a matching device id to get correct MetricsContext and chip name */
         for (i = 0; i < num_gpus; ++i) {
             if (cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) {
@@ -2532,9 +2514,9 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info)
                 break;
             }
         }
-        papi_errno = retrieve_metric_descr( avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext,
+        papi_errno = retrieve_metric_descr( cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams->pMetricsContext,
                                             cuptiu_table_p->events[inf.nameid].name, cuptiu_table_p->events[inf.nameid].desc,
-                                            avail_events[gpu_id].pmetricsContextCreateParams->pChipName );
+                                            cuptiu_table_p->avail_gpu_info[gpu_id].pmetricsContextCreateParams->pChipName );
         if (papi_errno != PAPI_OK) {
             return papi_errno;
         }
@@ -2551,16 +2533,22 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info)
             break;
         case DEVICE_FLAG:
         {
+            int init_metric_dev_id;
             char devices[PAPI_MAX_STR_LEN] = { 0 };
             for (i = 0; i < num_gpus; ++i) {
                 if (cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) {
+                    /* for a metric, store the first device found to use with :device=# */
+                    if (devices[0] == '\0') {
+                        init_metric_dev_id = i;
+                    }
+
                     sprintf(devices + strlen(devices), "%i,", i);
                 }
             }
             *(devices + strlen(devices) - 1) = 0;
 
             /* cuda native event name */
-            snprintf( info->symbol, PAPI_HUGE_STR_LEN, "%s:device=%i", cuptiu_table_p->events[inf.nameid].name, inf.device );
+            snprintf( info->symbol, PAPI_HUGE_STR_LEN, "%s:device=%i", cuptiu_table_p->events[inf.nameid].name, init_metric_dev_id );
             /* cuda native event short description */
             snprintf( info->short_descr, PAPI_MIN_STR_LEN, "%s masks:Mandatory device qualifier [%s]",
                      cuptiu_table_p->events[inf.nameid].desc, devices );
diff --git a/src/components/cuda/cupti_utils.h b/src/components/cuda/cupti_utils.h
index b79dd5130..9031ce97f 100644
--- a/src/components/cuda/cupti_utils.h
+++ b/src/components/cuda/cupti_utils.h
@@ -9,10 +9,15 @@
 #define __CUPTI_UTILS_H__
 
 #include <papi.h>
+
+#include <nvperf_cuda_host.h> 
+
 #include <stdint.h>
 
+
 typedef int64_t cuptiu_bitmap_t;
 typedef int (*cuptiu_dev_get_map_cb)(uint64_t event_id, int *dev_id);
+typedef NVPW_CUDA_MetricsContext_Create_Params MCCP_t;
 
 typedef struct event_record_s {
     char name[PAPI_2MAX_STR_LEN];
@@ -20,11 +25,19 @@ typedef struct event_record_s {
     cuptiu_bitmap_t device_map;
 } cuptiu_event_t;
 
+typedef struct gpu_record_s {
+    char chip_name[32];
+    MCCP_t *pmetricsContextCreateParams;
+    int num_metrics;
+    const char* const* metric_names;
+} gpu_record_t;
+
 typedef struct event_table_s {
     unsigned int count;
     unsigned int capacity;
     char added_cuda_evts[30][PAPI_2MAX_STR_LEN];
     int added_cuda_dev[30];
+    gpu_record_t *avail_gpu_info;
     cuptiu_event_t *events;
     void *htable;
 } cuptiu_event_table_t;

From 21f78f263f550bfe8dc7c3915e63dcb25d8265d8 Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Thu, 23 Jan 2025 19:49:56 +0000
Subject: [PATCH 06/16] More changes to cuda workflow to allow for multiple
 gpus.

---
 src/components/cuda/cupti_profiler.c | 103 +++++++++++----------------
 src/components/cuda/cupti_utils.h    |   4 +-
 2 files changed, 43 insertions(+), 64 deletions(-)

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
index d29bb3340..fce17b41f 100644
--- a/src/components/cuda/cupti_profiler.c
+++ b/src/components/cuda/cupti_profiler.c
@@ -497,13 +497,11 @@ static int get_targeted_events_rmr(cuptip_gpu_state_t *gpu_ctl)
 
     /* for each event in the event table collect the raw metric requests */
     for (i = 0; i < gpu_ctl->targeted_events->count; i++) {
-        /* Not using the correct global event names now.*/
         papi_errno = retrieve_metric_rmr(
                          gpu_ctl->pmetricsContextCreateParams->pMetricsContext,
                          gpu_ctl->targeted_events->added_cuda_evts[i], &num_dep, 
                          &collect_rmr
                      );
-        /* why is PAPI_ENOEVNT hard coded? */
         if (papi_errno != PAPI_OK) {
             papi_errno = PAPI_ENOEVNT;
             goto fn_exit;
@@ -813,7 +811,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .pChipName = cuptiu_table_p->avail_gpu_info[gpu_ctl->gpu_id].chip_name,
     };
     nvpwCheckErrors( NVPW_CUDA_RawMetricsConfig_CreatePtr(&nvpw_metricsConfigCreateParams), goto fn_fail );
-    printf("Made it past first nvpwCheckErrors.\n");
 
     if( gpu_ctl->counterAvailabilityImage.data != NULL) {
         NVPW_RawMetricsConfig_SetCounterAvailability_Params setCounterAvailabilityParams = {
@@ -823,7 +820,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
             .pCounterAvailabilityImage = gpu_ctl->counterAvailabilityImage.data,
         };
         nvpwCheckErrors( NVPW_RawMetricsConfig_SetCounterAvailabilityPtr(&setCounterAvailabilityParams), goto fn_fail );
-        printf("Made it pass second nvpwCheckErrors.\n");
     };
 
     /* NOTE: maxPassCount is being set to 1 as a final safety net to limit metric collection to a single pass.
@@ -837,8 +833,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .maxPassCount = 1,
     };
     nvpwCheckErrors( NVPW_RawMetricsConfig_BeginPassGroupPtr(&beginPassGroupParams), goto fn_fail );
-    printf("Made it pass third nvpwCheckErrors.\n");
-
 
     NVPW_RawMetricsConfig_AddMetrics_Params addMetricsParams = {
         .structSize = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE,
@@ -847,13 +841,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .pRawMetricRequests = gpu_ctl->rmr,
         .numMetricRequests = gpu_ctl->rmr_count,
     };
-    //nvpwCheckErrors( NVPW_RawMetricsConfig_AddMetricsPtr(&addMetricsParams), goto fn_fail );
-    NVPA_Status _status = NVPW_RawMetricsConfig_AddMetricsPtr(&addMetricsParams);
-    if (_status != NVPA_STATUS_SUCCESS) {
-        printf("Failed with status: %d\n", _status);
-    } 
-    printf("Made it pass fourth nvpwCheckErrors.\n");
-
+    nvpwCheckErrors( NVPW_RawMetricsConfig_AddMetricsPtr(&addMetricsParams), goto fn_fail );
 
     NVPW_RawMetricsConfig_EndPassGroup_Params endPassGroupParams = {
         .structSize = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE,
@@ -861,7 +849,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
     };
     nvpwCheckErrors( NVPW_RawMetricsConfig_EndPassGroupPtr(&endPassGroupParams), goto fn_fail );
-    printf("Made it past fifth nvpwCheckErrors.\n");
 
     NVPW_RawMetricsConfig_GenerateConfigImage_Params generateConfigImageParams = {
         .structSize = NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE,
@@ -869,7 +856,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
     };
     nvpwCheckErrors( NVPW_RawMetricsConfig_GenerateConfigImagePtr(&generateConfigImageParams), goto fn_fail );
-    printf("Made it past sixth nvpwCheckErrors.\n");
 
     NVPW_RawMetricsConfig_GetConfigImage_Params getConfigImageParams = {
         .structSize = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE,
@@ -879,7 +865,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .pBuffer = NULL,
     };
     nvpwCheckErrors( NVPW_RawMetricsConfig_GetConfigImagePtr(&getConfigImageParams), goto fn_fail );
-    printf("Made it past seventh nvpwCheckErrors.\n");
 
     gpu_ctl->configImage.size = getConfigImageParams.bytesCopied;
     gpu_ctl->configImage.data = (uint8_t *) papi_calloc(gpu_ctl->configImage.size, sizeof(uint8_t));
@@ -891,7 +876,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
     getConfigImageParams.bytesAllocated = gpu_ctl->configImage.size;
     getConfigImageParams.pBuffer = gpu_ctl->configImage.data;
     nvpwCheckErrors( NVPW_RawMetricsConfig_GetConfigImagePtr(&getConfigImageParams), goto fn_fail );
-    printf("Made it past eigth nvpwCheckErrors.\n");
 
     NVPW_RawMetricsConfig_Destroy_Params rawMetricsConfigDestroyParams = {
         .structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE,
@@ -899,7 +883,6 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
         .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
     };
     nvpwCheckErrors( NVPW_RawMetricsConfig_DestroyPtr((NVPW_RawMetricsConfig_Destroy_Params *) &rawMetricsConfigDestroyParams), goto fn_fail );
-    printf("Made it past ninth nvpwCheckErrors.\n");
 
     return PAPI_OK;
 fn_fail:
@@ -1123,7 +1106,7 @@ static int begin_profiling(cuptip_gpu_state_t *gpu_ctl)
     };
     cuptiCheckErrors( cuptiProfilerEnableProfilingPtr(&enableProfilingParams), goto fn_fail );
 
-    char rangeName[64];
+    char rangeName[PAPI_MIN_STR_LEN];
     sprintf(rangeName, "PAPI_Range_%d", gpu_ctl->gpu_id);
     CUpti_Profiler_PushRange_Params pushRangeParams = {
         .structSize = CUpti_Profiler_PushRange_Params_STRUCT_SIZE,
@@ -1451,19 +1434,30 @@ int cuptip_init(void)
     return PAPI_EMISC;
 }
 
+/** @class verify_events
+  * @brief Verify user added events and store metadata i.e. metric names 
+  *        and device id's .
+  * @param *events_id
+  *   Cuda native event id's.
+  * @param num_events
+  *   Number of Cuda native events a user is wanting to count.
+  * @param state
+  *   Struct that holds read count, running, cuptip_info_t, and 
+  *   cuptip_gpu_state_t. 
+*/
 int verify_events(uint64_t *events_id, int num_events, 
                   cuptip_control_t state) 
 {
-    int papi_errno = PAPI_OK, i;
-    char name[PAPI_MAX_STR_LEN] = { 0 }; 
+    int papi_errno, i;
+    char *metricName;
 
     for (i = 0; i < num_gpus; i++) { 
         papi_errno = cuptiu_event_table_create_init_capacity(
-                         num_events * num_gpus,
+                         num_events,
                          sizeof(cuptiu_event_t), &(state->gpu_ctl[i].targeted_events)
                      ); 
         if (papi_errno != PAPI_OK) {
-            goto fn_exit;
+            return papi_errno;
         }
      }  
 
@@ -1472,19 +1466,21 @@ int verify_events(uint64_t *events_id, int num_events,
         papi_errno = evt_id_to_info(events_id[i], &info);
         if (papi_errno != PAPI_OK) {
             break;
-        }    
-        sprintf(name, "%s", cuptiu_table_p->events[info.nameid].name);
-        strcpy(state->gpu_ctl[info.device].targeted_events->added_cuda_evts[i], name);
-        state->gpu_ctl[info.device].targeted_events->added_cuda_dev[i] = info.device;
+        }
+       
+        /* store metadata i.e. metric names and device id's */
+        metricName = state->gpu_ctl[info.device].targeted_events->added_cuda_evts[i];
+        snprintf(metricName, PAPI_MAX_STR_LEN, "%s", cuptiu_table_p->events[info.nameid].name);
+
+        state->gpu_ctl[info.device].targeted_events->added_cuda_dev[i] = info.device; 
+
         void *p;
-        if (htable_find(cuptiu_table_p->htable, name, (void **) &p) != HTABLE_SUCCESS) {
-            htable_insert(state->gpu_ctl[info.device].targeted_events->htable, name, (void **) &p );
+        if (htable_find(cuptiu_table_p->htable, metricName, (void **) &p) != HTABLE_SUCCESS) {
+            return PAPI_ENOEVNT;
         }    
         state->gpu_ctl[info.device].targeted_events->count++;
-    }    
-
-  fn_exit:     
-    return papi_errno;  
+    }
+    return PAPI_OK;    
 }
 
 /** @class cuptip_ctx_create
@@ -1607,20 +1603,15 @@ int cuptip_ctx_start(cuptip_control_t state)
 
         /* CUPTI profiler host configuration */
         papi_errno = metric_get_config_image(gpu_ctl);
-        printf("papi_errno first: %d\n", papi_errno);
         papi_errno += metric_get_counter_data_prefix_image(gpu_ctl);
-        printf("papi_errno second: %d\n", papi_errno);
         papi_errno += create_counter_data_image(gpu_ctl);
-         printf("papi_errno third: %d\n", papi_errno);
         if (papi_errno != PAPI_OK) {
-            printf("cupti profiler host configuration.\n");
             ERRDBG("Failed to create CUPTI profiler state for gpu %d\n", gpu_id);
             goto fn_fail;
         }
 
         papi_errno = begin_profiling(gpu_ctl);
         if (papi_errno != PAPI_OK) {
-            printf("begin profiling failed.\n");
             ERRDBG("Failed to start profiling for gpu %d\n", gpu_id);
             goto fn_fail;
         }
@@ -1700,7 +1691,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters)
         if (papi_errno != PAPI_OK) {
             goto fn_exit;
         }
-        for (i = 0; i < (int) gpu_ctl->targeted_events->count; i++) {
+        for (i = 0; i < gpu_ctl->targeted_events->count; i++) {
             if (state->read_count == 0) {
                 counter_vals[i] = counts[i];
             }
@@ -1909,9 +1900,11 @@ int cuptip_shutdown(void)
 */
 int evt_id_create(event_info_t *info, uint64_t *event_id)
 {
+
     *event_id  = (uint64_t)(info->device   << DEVICE_SHIFT);
     *event_id |= (uint64_t)(info->flags    << QLMASK_SHIFT);
     *event_id |= (uint64_t)(info->nameid   << NAMEID_SHIFT);
+
     return PAPI_OK;
 }
 
@@ -1937,12 +1930,6 @@ int evt_id_to_info(uint64_t event_id, event_info_t *info)
         return PAPI_ENOEVNT;
     }
 
-    /* 
-    if (cuptiu_dev_check(cuptiu_table_p->events[info->nameid].device_map, info->device) == 0) {
-        return PAPI_ENOEVNT;
-   }
-   */
-
     if (info->nameid >= cuptiu_table_p->count) {
         return PAPI_ENOEVNT;
     }
@@ -2043,8 +2030,7 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name,
 
     /* check to see if capacity has been correctly allocated */
     if (evt_table->count >= evt_table->capacity) {
-        printf("Table count is larger than allocated capacity.\n");
-        return PAPI_ENOMEM;
+        return PAPI_EBUG;
     }
 
     cuptiu_event_t *event;
@@ -2311,7 +2297,6 @@ int cuptip_evt_enum(uint64_t *event_code, int modifier)
         case PAPI_ENUM_EVENTS:
             papi_errno = evt_id_to_info(*event_code, &info);
             if (papi_errno != PAPI_OK) {
-                printf("We fail enum_events evt_id_to_info.\n");
                 break;
             }
             if (cuptiu_table_p->count > info.nameid + 1) {
@@ -2319,9 +2304,6 @@ int cuptip_evt_enum(uint64_t *event_code, int modifier)
                 info.flags = 0;
                 info.nameid++;
                 papi_errno = evt_id_create(&info, event_code);
-                if (papi_errno != PAPI_OK) {
-                    printf("Failed to create id.\n");
-                }
                 break;
             }
             papi_errno = PAPI_END;
@@ -2495,17 +2477,17 @@ static int evt_code_to_name(uint64_t event_code, char *name, int len)
 int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info)
 {
 
-    int papi_errno, len, i, gpu_id;
-    event_info_t inf;
+    int papi_errno, i, gpu_id;
     char description[PAPI_HUGE_STR_LEN];
 
-    /* get event code info */
+    /* get the events nameid and flags */
+    event_info_t inf;
     papi_errno = evt_id_to_info(event_code, &inf);
     if (papi_errno != PAPI_OK) {
         return papi_errno;
     }
 
-    /* collect the description and calculated numpass for a specific nameid */
+    /* collect the description and calculated numpass for the Cuda event  */
     if (cuptiu_table_p->events[inf.nameid].desc[0] == '\0') {
         /* find a matching device id to get correct MetricsContext and chip name */
         for (i = 0; i < num_gpus; ++i) {
@@ -2524,11 +2506,9 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info)
 
     switch (inf.flags) {
         case (0):
-            /* cuda native event name */
+            /* store details for the Cuda event */ 
             snprintf( info->symbol, PAPI_HUGE_STR_LEN, "%s", cuptiu_table_p->events[inf.nameid].name );
-            /* cuda native event short description */
             snprintf( info->short_descr, PAPI_MIN_STR_LEN, "%s", cuptiu_table_p->events[inf.nameid].desc );
-            /* cuda native event long description */
             snprintf( info->long_descr, PAPI_HUGE_STR_LEN, "%s", cuptiu_table_p->events[inf.nameid].desc );
             break;
         case DEVICE_FLAG:
@@ -2537,7 +2517,8 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info)
             char devices[PAPI_MAX_STR_LEN] = { 0 };
             for (i = 0; i < num_gpus; ++i) {
                 if (cuptiu_dev_check(cuptiu_table_p->events[inf.nameid].device_map, i)) {
-                    /* for a metric, store the first device found to use with :device=# */
+                    /* for an event, store the first device found to use with :device=#, 
+                       as on a heterogenous system events may not appear on each device */
                     if (devices[0] == '\0') {
                         init_metric_dev_id = i;
                     }
@@ -2547,12 +2528,10 @@ int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info)
             }
             *(devices + strlen(devices) - 1) = 0;
 
-            /* cuda native event name */
+            /* store details for the Cuda event */
             snprintf( info->symbol, PAPI_HUGE_STR_LEN, "%s:device=%i", cuptiu_table_p->events[inf.nameid].name, init_metric_dev_id );
-            /* cuda native event short description */
             snprintf( info->short_descr, PAPI_MIN_STR_LEN, "%s masks:Mandatory device qualifier [%s]",
                      cuptiu_table_p->events[inf.nameid].desc, devices );
-            /* cuda native event long description */
             snprintf( info->long_descr, PAPI_HUGE_STR_LEN, "%s masks:Mandatory device qualifier [%s]",
                       cuptiu_table_p->events[inf.nameid].desc, devices );
             break;
diff --git a/src/components/cuda/cupti_utils.h b/src/components/cuda/cupti_utils.h
index 9031ce97f..1c8c8d1a9 100644
--- a/src/components/cuda/cupti_utils.h
+++ b/src/components/cuda/cupti_utils.h
@@ -26,14 +26,14 @@ typedef struct event_record_s {
 } cuptiu_event_t;
 
 typedef struct gpu_record_s {
-    char chip_name[32];
+    char chip_name[PAPI_MIN_STR_LEN];
     MCCP_t *pmetricsContextCreateParams;
     int num_metrics;
     const char* const* metric_names;
 } gpu_record_t;
 
 typedef struct event_table_s {
-    unsigned int count;
+    int count;
     unsigned int capacity;
     char added_cuda_evts[30][PAPI_2MAX_STR_LEN];
     int added_cuda_dev[30];

From bb7e9aad51261491eaeceabd293023bfdb68efad Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Sun, 2 Feb 2025 00:45:17 +0000
Subject: [PATCH 07/16] Pushing up to test on Voltar at Oregon

---
 src/components/cuda/cupti_profiler.c    | 133 +++++++++++++-----------
 src/components/cuda/cupti_utils.h       |   5 +-
 src/components/cuda/linux-cuda.c        |  25 +++--
 src/components/cuda/papi_cupti_common.c |  12 ++-
 4 files changed, 103 insertions(+), 72 deletions(-)

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
index fce17b41f..873c80f65 100644
--- a/src/components/cuda/cupti_profiler.c
+++ b/src/components/cuda/cupti_profiler.c
@@ -20,6 +20,7 @@
 #include "cupti_config.h"
 #include "lcuda_debug.h"
 #include "htable.h"
+#include <threads.h>
 
 /**
  * Event identifier encoding format:
@@ -63,7 +64,7 @@ struct byte_array_s {
 
 struct cuptip_gpu_state_s {
     int                    gpu_id;
-    cuptiu_event_table_t  *targeted_events;
+    cuptiu_event_table_t  *added_events;
     int                    rmr_count;
     NVPA_RawMetricRequest *rmr;
     MCCP_t                *pmetricsContextCreateParams;
@@ -91,6 +92,7 @@ static gpu_record_t *avail_gpu_info;
 
 static cuptiu_event_table_t *cuptiu_table_p;
 
+
 /* load and unload cuda function pointers */
 static int load_cupti_perf_sym(void);
 static int unload_cupti_perf_sym(void);
@@ -149,7 +151,7 @@ static int retrieve_metric_rmr( NVPA_MetricsContext *pMetricsContext, const char
 
 /* misc */
 static int get_event_collection_method(const char *evt_name);
-static int get_targeted_events_rmr(cuptip_gpu_state_t *gpu_ctl);
+static int get_added_events_rmr(cuptip_gpu_state_t *gpu_ctl);
 static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl);
 static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts);
 static int num_unique_devs(int num_gpus);
@@ -477,7 +479,7 @@ static int get_chip_name(int dev_num, char* chipName)
     return PAPI_OK;
 }
 
-/** @class get_targeted_events_rmr
+/** @class get_added_events_rmr
   * @brief For a Cuda native event name collect raw metrics and count
   *        of raw metrics for collection. Raw Metrics are one layer of the Metric API
   *        and contains the list of raw counters and generates configuration file
@@ -487,7 +489,7 @@ static int get_chip_name(int dev_num, char* chipName)
   *   Structure of type cuptip_gpu_state_t which has member variables such as 
   *   gpu_id, rmr, rmr_count, and more.
 */
-static int get_targeted_events_rmr(cuptip_gpu_state_t *gpu_ctl)
+static int get_added_events_rmr(cuptip_gpu_state_t *gpu_ctl)
 {
     COMPDBG("Entering.\n");
     int gpu_id, num_dep, count_raw_metrics = 0, papi_errno = PAPI_OK;
@@ -496,10 +498,10 @@ static int get_targeted_events_rmr(cuptip_gpu_state_t *gpu_ctl)
     cuptiu_event_t *evt_rec;
 
     /* for each event in the event table collect the raw metric requests */
-    for (i = 0; i < gpu_ctl->targeted_events->count; i++) {
+    for (i = 0; i < gpu_ctl->added_events->count; i++) {
         papi_errno = retrieve_metric_rmr(
                          gpu_ctl->pmetricsContextCreateParams->pMetricsContext,
-                         gpu_ctl->targeted_events->added_cuda_evts[i], &num_dep, 
+                         gpu_ctl->added_events->cuda_evts[i], &num_dep, 
                          &collect_rmr
                      );
         if (papi_errno != PAPI_OK) {
@@ -704,11 +706,11 @@ static int check_multipass(cuptip_control_t state)
 
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
-        if (gpu_ctl->targeted_events->count == 0) {
+        if (gpu_ctl->added_events->count == 0) {
             continue;
         }
 
-        papi_errno = get_targeted_events_rmr(gpu_ctl);
+        papi_errno = get_added_events_rmr(gpu_ctl);
         if (papi_errno != PAPI_OK) {
             goto fn_exit;
         }
@@ -795,7 +797,7 @@ static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl)
 
 /** @class metric_get_config_image
   * @brief Retrieves binary ConfigImage for the Cuda native event metrics listed 
-  *        for collection. The function get_targeted_events_rmr( ... ) must be 
+  *        for collection. The function get_added_events_rmr( ... ) must be 
   *        called before this step is possible. 
   * @param *gpu_ctl
   *   Structure of type cuptip_gpu_state_t which has member variables such as 
@@ -804,11 +806,13 @@ static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl)
 static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
 {
     COMPDBG("Entering.\n");
+    int gpu_id = gpu_ctl->gpu_id;
+
     NVPW_CUDA_RawMetricsConfig_Create_Params nvpw_metricsConfigCreateParams = {
         .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE,
         .pPriv = NULL,
         .activityKind = NVPA_ACTIVITY_KIND_PROFILER,
-        .pChipName = cuptiu_table_p->avail_gpu_info[gpu_ctl->gpu_id].chip_name,
+        .pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name,
     };
     nvpwCheckErrors( NVPW_CUDA_RawMetricsConfig_CreatePtr(&nvpw_metricsConfigCreateParams), goto fn_fail );
 
@@ -820,7 +824,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
             .pCounterAvailabilityImage = gpu_ctl->counterAvailabilityImage.data,
         };
         nvpwCheckErrors( NVPW_RawMetricsConfig_SetCounterAvailabilityPtr(&setCounterAvailabilityParams), goto fn_fail );
-    };
+    }
 
     /* NOTE: maxPassCount is being set to 1 as a final safety net to limit metric collection to a single pass.
              Metrics that require multiple passes would fail further down at AddMetrics due to this.
@@ -891,7 +895,7 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
 
 /** @class metric_get_counter_data_prefix_image
   * @brief Retrieves binary CounterDataPrefix for the Cuda native event metrics 
-  *        listed for collection. The function get_targeted_events_rmr( ... ) 
+  *        listed for collection. The function get_added_events_rmr( ... ) 
   *        must be called before this step is possible. 
   * @param *gpu_ctl
   *   Structure of type cuptip_gpu_state_t which has member variables such as 
@@ -1195,7 +1199,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts)
 {
     COMPDBG("eval_metric_values. dev = %d\n", gpu_ctl->gpu_id);
     int i, papi_errno = PAPI_OK;
-    int numMetrics = gpu_ctl->targeted_events->count;
+    int numMetrics = gpu_ctl->added_events->count;
     double *gpuValues;
     char **metricNames;
 
@@ -1219,7 +1223,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts)
     }    
 
     for (i = 0; i < numMetrics; i++) {
-        metricNames[i] = gpu_ctl->targeted_events->added_cuda_evts[i];
+        metricNames[i] = gpu_ctl->added_events->cuda_evts[i];
         LOGDBG("Setting metric name %s\n", metricNames[i]);
     }
 
@@ -1246,7 +1250,7 @@ static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts)
     nvpwCheckErrors( NVPW_MetricsContext_EvaluateToGpuValuesPtr(&evalToGpuParams), goto fn_fail );
 
     /* store the gpu values */
-    for (i = 0; i < (int) gpu_ctl->targeted_events->count; i++) {
+    for (i = 0; i < (int) gpu_ctl->added_events->count; i++) {
         counts[i] = gpuValues[i];
     }
 
@@ -1398,7 +1402,6 @@ int cuptip_init(void)
         goto fn_fail;
     }
 
-    /* if no gpu's are found exit */
     if (num_gpus <= 0) {
         cuptic_disabled_reason_set("No GPUs found on system.");
         goto fn_fail;
@@ -1412,13 +1415,15 @@ int cuptip_init(void)
         goto fn_fail;
     }
 
-    /* init htable and allocate memory */
+    /* initialize the main event table for metric collection */
     init_main_htable();    
 
     papi_errno = init_all_metrics();
     if (papi_errno != PAPI_OK) {
         goto fn_fail;
     }
+    /* initialize hash table with cuda native events */
+    init_event_table();
 
     papi_errno = cuInitPtr(0);
     if (papi_errno != CUDA_SUCCESS) {
@@ -1426,9 +1431,6 @@ int cuptip_init(void)
         goto fn_fail;
     }
 
-    /* initialize hash table with cuda native events */
-    init_event_table();
-
     return PAPI_OK;
 fn_fail:
     return PAPI_EMISC;
@@ -1450,11 +1452,12 @@ int verify_events(uint64_t *events_id, int num_events,
 {
     int papi_errno, i;
     char *metricName;
+    int idx;
 
-    for (i = 0; i < num_gpus; i++) { 
+    for (i = 0; i < num_gpus; i++) {
         papi_errno = cuptiu_event_table_create_init_capacity(
                          num_events,
-                         sizeof(cuptiu_event_t), &(state->gpu_ctl[i].targeted_events)
+                         sizeof(cuptiu_event_t), &(state->gpu_ctl[i].added_events)
                      ); 
         if (papi_errno != PAPI_OK) {
             return papi_errno;
@@ -1465,21 +1468,24 @@ int verify_events(uint64_t *events_id, int num_events,
         event_info_t info;
         papi_errno = evt_id_to_info(events_id[i], &info);
         if (papi_errno != PAPI_OK) {
-            break;
+            return papi_errno;
         }
-       
-        /* store metadata i.e. metric names and device id's */
-        metricName = state->gpu_ctl[info.device].targeted_events->added_cuda_evts[i];
-        snprintf(metricName, PAPI_MAX_STR_LEN, "%s", cuptiu_table_p->events[info.nameid].name);
+ 
+        /* for the current device table get the next event index  */
+        idx = state->gpu_ctl[info.device].added_events->count; 
 
-        state->gpu_ctl[info.device].targeted_events->added_cuda_dev[i] = info.device; 
+        metricName = state->gpu_ctl[info.device].added_events->cuda_evts[idx];
+        snprintf(metricName, PAPI_MAX_STR_LEN, "%s", cuptiu_table_p->events[info.nameid].name);
 
         void *p;
         if (htable_find(cuptiu_table_p->htable, metricName, (void **) &p) != HTABLE_SUCCESS) {
             return PAPI_ENOEVNT;
-        }    
-        state->gpu_ctl[info.device].targeted_events->count++;
+        }
+        state->gpu_ctl[info.device].added_events->cuda_devs[idx] = info.device;
+        state->gpu_ctl[info.device].added_events->evt_pos[idx] = i; 
+        state->gpu_ctl[info.device].added_events->count++; /* total number of events added for a specific device  */
     }
+
     return PAPI_OK;    
 }
 
@@ -1500,27 +1506,25 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t
     int papi_errno = PAPI_OK, gpu_id, i;
     long long *counters = NULL;
     char name[PAPI_2MAX_STR_LEN] = { 0 };
-    cuptiu_event_table_t *targeted_targeted_events;
 
-    /* create a cuptip_control_t struct which contains read_count, running, cupti_info_t and cuptip_gpu_state_t */
     cuptip_control_t state = (cuptip_control_t) papi_calloc (1, sizeof(struct cuptip_control_s));
     if (state == NULL) {
         return PAPI_ENOMEM;
     }
 
-    /* allocate memory for the total number of gpus for the cuptip_gpu_state_t struct 
-       with the device qualifier refactor we only want to count the total number of unique gpus */
     state->gpu_ctl = (cuptip_gpu_state_t *) papi_calloc(num_gpus, sizeof(cuptip_gpu_state_t));
     if (state->gpu_ctl == NULL) {
         return PAPI_ENOMEM;
     }
 
     counters = papi_malloc(num_events * sizeof(*counters));
+    if (counters == NULL) {
+        return PAPI_ENOMEM;
+    }
 
     /* for each unique gpu store the gpu id for that gpu index */
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         state->gpu_ctl[gpu_id].gpu_id = gpu_id;
-        //state->gpu_ctl[gpu_id].targeted_events = targeted_targeted_events;
     }
 
     /* register the user created cuda context for the current gpu if not already known */
@@ -1529,18 +1533,19 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t
         goto fn_exit;
     }
 
-    /* creates a pMetricsContext */
+    /* creates a MetricsContext */
     papi_errno = nvpw_cuda_metricscontext_create(state);
     if (papi_errno != PAPI_OK) {
         goto fn_exit;
     }
 
+    /* verify user added events are available on the system */
     papi_errno = verify_events(events_id, num_events, state);
     if (papi_errno != PAPI_OK) {
         goto fn_exit;
     }
 
-    /* multipass is not supported; therefore, we must check the Cuda native event */
+    /* check to make sure added events do not require multiple passes */
     papi_errno = check_multipass(state);
     if (papi_errno != PAPI_OK) {
         goto fn_exit;
@@ -1580,16 +1585,16 @@ int cuptip_ctx_start(cuptip_control_t state)
     /* enumerate through all of the unique gpus */
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
-        if (gpu_ctl->targeted_events->count == 0) {
+        if (gpu_ctl->added_events->count == 0) {
             continue;
         }
-        LOGDBG("Device num %d: event_count %d, rmr count %d\n", gpu_id, gpu_ctl->targeted_events->count, gpu_ctl->rmr_count);
-        papi_errno = cuptic_device_acquire(state->gpu_ctl[gpu_id].targeted_events);
+        LOGDBG("Device num %d: event_count %d, rmr count %d\n", gpu_id, gpu_ctl->added_events->count, gpu_ctl->rmr_count);
+        papi_errno = cuptic_device_acquire(state->gpu_ctl[gpu_id].added_events);
         if (papi_errno != PAPI_OK) {
             ERRDBG("Profiling same gpu from multiple event sets not allowed.\n");
             return papi_errno;
         }
-        /* get the cuda context for the unique gpu */
+        /* get the cuda context  */
         papi_errno = cuptic_ctxarr_get_ctx(state->info, gpu_id, &ctx);
         /* bind the specified CUDA context to the calling CPU thread */
         cudaCheckErrors( cuCtxSetCurrentPtr(ctx), goto fn_fail_misc );
@@ -1640,8 +1645,9 @@ int cuptip_ctx_start(cuptip_control_t state)
 int cuptip_ctx_read(cuptip_control_t state, long long **counters)
 {
     COMPDBG("Entering.\n");
-    int papi_errno, gpu_id, i, j = 0, method;
-    long long counts[30], *counter_vals = state->counters;
+    int papi_errno, gpu_id, i, j = 0, method, evt_pos;
+    long long counts[30];
+    long long *counter_vals = state->counters;
     cuptip_gpu_state_t *gpu_ctl = NULL;
     CUcontext userCtx = NULL, ctx = NULL;
 
@@ -1650,10 +1656,9 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters)
         cudaArtCheckErrors( cudaFreePtr(NULL), goto fn_fail_misc );
         cudaArtCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc );
     }
-
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
-        if (gpu_ctl->targeted_events->count == 0) {
+        if (gpu_ctl->added_events->count == 0) {
             continue;
         }
 
@@ -1691,22 +1696,27 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters)
         if (papi_errno != PAPI_OK) {
             goto fn_exit;
         }
-        for (i = 0; i < gpu_ctl->targeted_events->count; i++) {
+
+        for (i = 0; i < gpu_ctl->added_events->count; i++) {
+            printf("counts value: %d and gpu_id: %d\n", counts[i], gpu_id);
+            printf("evt_pos: %d\n", gpu_ctl->added_events->evt_pos[i]);
+            evt_pos = gpu_ctl->added_events->evt_pos[i];
             if (state->read_count == 0) {
-                counter_vals[i] = counts[i];
+                counter_vals[evt_pos] = counts[i];
             }
             else {
+                printf("WE ENTER ELSE STATEMENT.\n");
                 /* determine collection method such as max, min, sum, and avg for an added Cuda native event */
-                method = get_event_collection_method(gpu_ctl->targeted_events->added_cuda_evts[i]);
+                method = get_event_collection_method(gpu_ctl->added_events->cuda_evts[i]);
                 switch (method) {
                     case CUDA_SUM:
-                        counter_vals[i] += counts[i];
+                        counter_vals[evt_pos] += counts[i];
                         break;
                     case CUDA_MIN:
-                        counter_vals[i] = counter_vals[i] < counts[i] ? counter_vals[i] : counts[i];
+                        counter_vals[evt_pos] = counter_vals[evt_pos] < counts[i] ? counter_vals[evt_pos] : counts[i];
                         break;
                     case CUDA_MAX:
-                        counter_vals[i] = counter_vals[i] > counts[i] ? counter_vals[i] : counts[i];
+                        counter_vals[evt_pos] = counter_vals[evt_pos] > counts[i] ? counter_vals[evt_pos] : counts[i];
                         break;
                     case CUDA_AVG:
                          /* (size * average + value) / (size + 1) 
@@ -1714,15 +1724,15 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters)
                             average - current average
                             value - number to add to the average
                          */
-                         counter_vals[i] = (state->read_count * counter_vals[j++] + counts[i]) / (state->read_count + 1);
+                         counter_vals[evt_pos] = (state->read_count * counter_vals[j++] + counts[i]) / (state->read_count + 1);
                          break;
                     default:
-                        counter_vals[i] = counts[i];
+                        counter_vals[evt_pos] = counts[i];
                         break;
                 }
             }
         }
-        *counters = state->counters;
+        *counters = counter_vals;
 
         cuptiCheckErrors( cuptiProfilerCounterDataImageInitializePtr(&gpu_ctl->initializeParams), goto fn_fail_misc );
         cuptiCheckErrors( cuptiProfilerCounterDataImageInitializeScratchBufferPtr(&gpu_ctl->initScratchBufferParams), goto fn_fail_misc );
@@ -1734,7 +1744,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters)
         };
         cuptiCheckErrors( cuptiProfilerBeginPassPtr(&beginPassParams), goto fn_fail_misc );
 
-        char rangeName[64];
+        char rangeName[PAPI_MIN_STR_LEN];
         sprintf(rangeName, "PAPI_Range_%d", gpu_ctl->gpu_id);
         CUpti_Profiler_PushRange_Params pushRangeParams = {
             .structSize = CUpti_Profiler_PushRange_Params_STRUCT_SIZE,
@@ -1799,7 +1809,7 @@ int cuptip_ctx_stop(cuptip_control_t state)
 
     for (gpu_id=0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
-        if (gpu_ctl->targeted_events->count == 0) {
+        if (gpu_ctl->added_events->count == 0) {
             continue;
         }
         papi_errno = cuptic_ctxarr_get_ctx(state->info, gpu_id, &ctx);
@@ -1808,7 +1818,7 @@ int cuptip_ctx_stop(cuptip_control_t state)
         if (papi_errno != PAPI_OK) {
             goto fn_fail;
         }
-        papi_errno = cuptic_device_release(state->gpu_ctl[gpu_id].targeted_events);
+        papi_errno = cuptic_device_release(state->gpu_ctl[gpu_id].added_events);
         if (papi_errno != PAPI_OK) {
             goto fn_fail;
         }
@@ -1838,7 +1848,7 @@ int cuptip_ctx_destroy(cuptip_control_t *pstate)
     int papi_errno = nvpw_cuda_metricscontext_destroy(state);
     for (i = 0; i < num_gpus; i++) {
         reset_cupti_prof_config_images( &(state->gpu_ctl[i]) );
-        cuptiu_event_table_destroy( &(state->gpu_ctl[i].targeted_events) );
+        cuptiu_event_table_destroy( &(state->gpu_ctl[i].added_events) );
         for (j = 0; j < state->gpu_ctl[i].rmr_count; j++) {
             papi_free((void *) state->gpu_ctl[i].rmr[j].pMetricName);
         }
@@ -2029,7 +2039,7 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name,
     }
 
     /* check to see if capacity has been correctly allocated */
-    if (evt_table->count >= evt_table->capacity) {
+    if (*count >= evt_table->capacity) {
         return PAPI_EBUG;
     }
 
@@ -2048,7 +2058,7 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name,
             return PAPI_ESYS;
         }
     }
-    
+ 
     cuptiu_dev_set(&event->device_map, gpu_id);
 
     return PAPI_OK;
@@ -2366,7 +2376,6 @@ int cuptip_evt_code_to_descr(uint64_t event_code, char *descr, int len)
 */
 int cuptip_evt_name_to_code(const char *name, uint64_t *event_code)
 {
-
     int htable_errno, device, flags, nameid, papi_errno = PAPI_OK;
     cuptiu_event_t *event;
     char base[PAPI_MAX_STR_LEN] = { 0 };
@@ -2421,7 +2430,7 @@ int cuptip_evt_name_to_code(const char *name, uint64_t *event_code)
   *   Maximum alloted characters for base Cuda native event name. 
 */
 int cuptip_evt_code_to_name(uint64_t event_code, char *name, int len)
-{   
+{ 
     return evt_code_to_name(event_code, name, len);
 }
 
diff --git a/src/components/cuda/cupti_utils.h b/src/components/cuda/cupti_utils.h
index 1c8c8d1a9..17c186111 100644
--- a/src/components/cuda/cupti_utils.h
+++ b/src/components/cuda/cupti_utils.h
@@ -35,8 +35,9 @@ typedef struct gpu_record_s {
 typedef struct event_table_s {
     int count;
     unsigned int capacity;
-    char added_cuda_evts[30][PAPI_2MAX_STR_LEN];
-    int added_cuda_dev[30];
+    char cuda_evts[30][PAPI_2MAX_STR_LEN];
+    int cuda_devs[30];
+    int evt_pos[30];
     gpu_record_t *avail_gpu_info;
     cuptiu_event_t *events;
     void *htable;
diff --git a/src/components/cuda/linux-cuda.c b/src/components/cuda/linux-cuda.c
index 37e173864..2a6f8485c 100644
--- a/src/components/cuda/linux-cuda.c
+++ b/src/components/cuda/linux-cuda.c
@@ -171,6 +171,8 @@ static int cuda_init_private(void)
 
     _papi_hwi_lock(COMPONENT_LOCK);
     SUBDBG("ENTER\n");
+    if (_cuda_vector.cmp_info.initialized) goto fn_exit;
+    SUBDBG("Proceeding\n");
 
     papi_errno = cuptid_init();
     if (papi_errno != PAPI_OK) {
@@ -201,10 +203,12 @@ static int cuda_init_private(void)
 
 static int check_n_initialize(void)
 {
+
+    //_papi_hwi_lock(COMPONENT_LOCK); 
     if (!_cuda_vector.cmp_info.initialized) {
         return cuda_init_private();
     }
-
+    //_papi_hwi_unlock(COMPONENT_LOCK);
     return _cuda_vector.cmp_info.disabled;
 }
 
@@ -215,9 +219,11 @@ static int cuda_ntv_enum_events(unsigned int *event_code, int modifier)
     if (papi_errno != PAPI_OK) {
         goto fn_exit;
     }
-    
+   
     uint64_t code = *(uint64_t *) event_code;
+    //_papi_hwi_lock(COMPONENT_LOCK);
     papi_errno = cuptid_evt_enum(&code, modifier);
+    //_papi_hwi_unlock(COMPONENT_LOCK);
     *event_code = (unsigned int) code;
     
 fn_exit:
@@ -233,9 +239,11 @@ static int cuda_ntv_name_to_code(const char *name, unsigned int *event_code)
     if (papi_errno != PAPI_OK) {
         goto fn_exit;
     }
-    
+   
     uint64_t code;
+    //_papi_hwi_lock(COMPONENT_LOCK);
     papi_errno = cuptid_evt_name_to_code(name, &code);
+    //_papi_hwi_unlock(COMPONENT_LOCK);
     *event_code = (unsigned int) code;
 
     fn_exit:
@@ -269,7 +277,9 @@ static int cuda_ntv_code_to_descr(unsigned int event_code, char *descr, int len)
         goto fn_fail;
     }
 
+    //_papi_hwi_lock(COMPONENT_LOCK);
     papi_errno = cuptid_evt_code_to_descr((uint64_t) event_code, descr, len);
+    //_papi_hwi_unlock(COMPONENT_LOCK);
 
 fn_exit:
     SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno));
@@ -286,7 +296,7 @@ static int cuda_ntv_code_to_info(unsigned int event_code, PAPI_event_info_t *inf
         goto fn_fail;
     }
 
-    papi_errno = cuptid_evt_code_to_info((uint64_t) event_code, info); 
+    papi_errno = cuptid_evt_code_to_info((uint64_t) event_code, info);
 
 fn_exit:
     SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno));
@@ -317,7 +327,8 @@ static int cuda_shutdown_thread(hwd_context_t *ctx)
 static int cuda_init_control_state(hwd_control_state_t __attribute__((unused)) *ctl)
 {
     COMPDBG("Entering.\n");
-    return PAPI_OK;
+    //return PAPI_OK;
+    return check_n_initialize();
 }
 
 static int cuda_set_domain(hwd_control_state_t __attribute__((unused)) *ctrl, int domain)
@@ -360,7 +371,7 @@ static int cuda_update_control_state(hwd_control_state_t *ctl, NativeInfo_t *ntv
     if (papi_errno != PAPI_OK) {
         goto fn_exit;
     }
-   
+    printf("ntv_count: %d\n", ntv_count); 
     /* needed to make sure multipass events are caught with proper error code (PAPI_EMULPASS)*/
     papi_errno = cuptid_ctx_create(cuda_ctl->info, &(cuda_ctl->cuptid_ctx), cuda_ctl->events_id, cuda_ctl->num_events);
 fn_exit:
@@ -403,7 +414,7 @@ int update_native_events(cuda_control_t *ctl, NativeInfo_t *ntv_info,
         sorted_events[i].frontend_idx = i;
     }
 
-    qsort(sorted_events, ntv_count, sizeof(struct event_map_item), compare);
+    //qsort(sorted_events, ntv_count, sizeof(struct event_map_item), compare);
 
     for (i = 0; i < ntv_count; ++i) {
         ctl->events_id[i] = sorted_events[i].event_id;
diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c
index f3aeb9e08..ca4b953d6 100644
--- a/src/components/cuda/papi_cupti_common.c
+++ b/src/components/cuda/papi_cupti_common.c
@@ -619,6 +619,8 @@ int cuptic_ctxarr_update_current(cuptic_info_t info)
         return PAPI_EMISC;
     }
 
+    printf("gpu_id inside update_current is: %d\n", gpu_id);
+
     /* return cuda context bound to the calling CPU thread */
     cuda_err = cuCtxGetCurrentPtr(&pctx);
     if (cuda_err != cudaSuccess) {
@@ -636,6 +638,7 @@ int cuptic_ctxarr_update_current(cuptic_info_t info)
         }
         /* cuda context not found for calling CPU thread */
         else {
+            printf("We create a cuda context.\n");
             cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC);
             cudaCheckErrors(cuCtxGetCurrentPtr(&info[gpu_id].ctx), return PAPI_EMISC);
             LOGDBG("Using primary device context %p for device %d.\n", info[gpu_id].ctx, gpu_id);
@@ -654,6 +657,13 @@ int cuptic_ctxarr_update_current(cuptic_info_t info)
 int cuptic_ctxarr_get_ctx(cuptic_info_t info, int gpu_idx, CUcontext *ctx)
 {
     *ctx = info[gpu_idx].ctx;
+    printf("gpu_idx: %d\n", gpu_idx);
+    if (*ctx == NULL) {
+        printf("ctx is null.\n");
+        *ctx = info[0].ctx;
+        if (*ctx != NULL) printf("Gpu id 0 is not null.\n"); 
+    }
+     
     return PAPI_OK;
 }
 
@@ -699,7 +709,7 @@ static int _devmask_events_get(cuptiu_event_table_t *evt_table, gpu_occupancy_t
     gpu_occupancy_t acq_mask = 0;
     cuptiu_event_t *evt_rec;
     for (i = 0; i < evt_table->count; i++) {
-        acq_mask |= (1 << evt_table->added_cuda_dev[i]);
+        acq_mask |= (1 << evt_table->cuda_devs[i]);
     }
     *bitmask = acq_mask;
 fn_exit:

From d54558d17c615f8b5c9260e4c1a6162b3bbb559c Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Wed, 5 Feb 2025 13:15:21 +0000
Subject: [PATCH 08/16] Finalize support for heterogeneous GPU configurations.

---
 src/components/cuda/cupti_profiler.c    | 37 ++++++++++++-------------
 src/components/cuda/linux-cuda.c        | 18 ++----------
 src/components/cuda/papi_cupti_common.c | 10 -------
 3 files changed, 20 insertions(+), 45 deletions(-)

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
index 873c80f65..b754b5c4a 100644
--- a/src/components/cuda/cupti_profiler.c
+++ b/src/components/cuda/cupti_profiler.c
@@ -20,7 +20,6 @@
 #include "cupti_config.h"
 #include "lcuda_debug.h"
 #include "htable.h"
-#include <threads.h>
 
 /**
  * Event identifier encoding format:
@@ -90,9 +89,9 @@ static void *dl_nvpw;
 static int num_gpus;
 static gpu_record_t *avail_gpu_info;
 
+/* main event table to store metrics */
 static cuptiu_event_table_t *cuptiu_table_p;
 
-
 /* load and unload cuda function pointers */
 static int load_cupti_perf_sym(void);
 static int unload_cupti_perf_sym(void);
@@ -154,7 +153,6 @@ static int get_event_collection_method(const char *evt_name);
 static int get_added_events_rmr(cuptip_gpu_state_t *gpu_ctl);
 static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl);
 static int get_measured_values(cuptip_gpu_state_t *gpu_ctl, long long *counts);
-static int num_unique_devs(int num_gpus);
 
 /* nvperf function pointers */
 NVPA_Status ( *NVPW_GetSupportedChipNamesPtr ) (NVPW_GetSupportedChipNames_Params* params);
@@ -904,10 +902,12 @@ static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
 static int metric_get_counter_data_prefix_image(cuptip_gpu_state_t *gpu_ctl)
 {
     COMPDBG("Entering.\n");
+    int gpu_id = gpu_ctl->gpu_id;
+
     NVPW_CounterDataBuilder_Create_Params counterDataBuilderCreateParams = {
         .structSize = NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE,
         .pPriv = NULL,
-        .pChipName = cuptiu_table_p->avail_gpu_info[gpu_ctl->gpu_id].chip_name,
+        .pChipName = cuptiu_table_p->avail_gpu_info[gpu_id].chip_name,
     };
     nvpwCheckErrors( NVPW_CounterDataBuilder_CreatePtr(&counterDataBuilderCreateParams), goto fn_fail );
 
@@ -1111,7 +1111,8 @@ static int begin_profiling(cuptip_gpu_state_t *gpu_ctl)
     cuptiCheckErrors( cuptiProfilerEnableProfilingPtr(&enableProfilingParams), goto fn_fail );
 
     char rangeName[PAPI_MIN_STR_LEN];
-    sprintf(rangeName, "PAPI_Range_%d", gpu_ctl->gpu_id);
+    int gpu_id = gpu_ctl->gpu_id;
+    sprintf(rangeName, "PAPI_Range_%d", gpu_id);
     CUpti_Profiler_PushRange_Params pushRangeParams = {
         .structSize = CUpti_Profiler_PushRange_Params_STRUCT_SIZE,
         .pPriv = NULL,
@@ -1366,12 +1367,13 @@ static void init_main_htable(void)
 {
     int i, val = 1, base = 2;
 
-    /* allocate 2 ^ 21 metric names, this matches the number of bits for the event encoding format */
+    /* allocate (2 ^ NAMEID_WIDTH) metric names, this matches the 
+       number of bits for the event encoding format */
     for (i = 0; i < NAMEID_WIDTH; i++) {
         val *= base;
     }    
    
-    /* allocate memory */ 
+    /* initialize struct */ 
     cuptiu_table_p = papi_malloc(sizeof(cuptiu_event_table_t));
     cuptiu_table_p->capacity = val; 
     cuptiu_table_p->count = 0; 
@@ -1415,14 +1417,14 @@ int cuptip_init(void)
         goto fn_fail;
     }
 
-    /* initialize the main event table for metric collection */
     init_main_htable();    
 
     papi_errno = init_all_metrics();
     if (papi_errno != PAPI_OK) {
         goto fn_fail;
     }
-    /* initialize hash table with cuda native events */
+
+    /* collect metrics */
     init_event_table();
 
     papi_errno = cuInitPtr(0);
@@ -1471,7 +1473,7 @@ int verify_events(uint64_t *events_id, int num_events,
             return papi_errno;
         }
  
-        /* for the current device table get the next event index  */
+        /* for a specific device table, get the current event index */
         idx = state->gpu_ctl[info.device].added_events->count; 
 
         metricName = state->gpu_ctl[info.device].added_events->cuda_evts[idx];
@@ -1522,7 +1524,6 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t
         return PAPI_ENOMEM;
     }
 
-    /* for each unique gpu store the gpu id for that gpu index */
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         state->gpu_ctl[gpu_id].gpu_id = gpu_id;
     }
@@ -1533,13 +1534,13 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t
         goto fn_exit;
     }
 
-    /* creates a MetricsContext */
+    /* create a MetricsContext */
     papi_errno = nvpw_cuda_metricscontext_create(state);
     if (papi_errno != PAPI_OK) {
         goto fn_exit;
     }
 
-    /* verify user added events are available on the system */
+    /* verify user added events are available on the machine */
     papi_errno = verify_events(events_id, num_events, state);
     if (papi_errno != PAPI_OK) {
         goto fn_exit;
@@ -1646,8 +1647,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters)
 {
     COMPDBG("Entering.\n");
     int papi_errno, gpu_id, i, j = 0, method, evt_pos;
-    long long counts[30];
-    long long *counter_vals = state->counters;
+    long long counts[30], *counter_vals = state->counters;
     cuptip_gpu_state_t *gpu_ctl = NULL;
     CUcontext userCtx = NULL, ctx = NULL;
 
@@ -1698,14 +1698,11 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters)
         }
 
         for (i = 0; i < gpu_ctl->added_events->count; i++) {
-            printf("counts value: %d and gpu_id: %d\n", counts[i], gpu_id);
-            printf("evt_pos: %d\n", gpu_ctl->added_events->evt_pos[i]);
             evt_pos = gpu_ctl->added_events->evt_pos[i];
             if (state->read_count == 0) {
                 counter_vals[evt_pos] = counts[i];
             }
             else {
-                printf("WE ENTER ELSE STATEMENT.\n");
                 /* determine collection method such as max, min, sum, and avg for an added Cuda native event */
                 method = get_event_collection_method(gpu_ctl->added_events->cuda_evts[i]);
                 switch (method) {
@@ -2031,7 +2028,7 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name,
     int papi_errno;
     char description[256];
     int *count = &evt_table->count;
-    cuptiu_event_t *events = evt_table->events;    
+    cuptiu_event_t *events = evt_table->events;
 
     /* check to see if evt_name argument has been provided */
     if (evt_name == NULL) {
@@ -2058,7 +2055,7 @@ static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name,
             return PAPI_ESYS;
         }
     }
- 
+
     cuptiu_dev_set(&event->device_map, gpu_id);
 
     return PAPI_OK;
diff --git a/src/components/cuda/linux-cuda.c b/src/components/cuda/linux-cuda.c
index 2a6f8485c..22c508fd0 100644
--- a/src/components/cuda/linux-cuda.c
+++ b/src/components/cuda/linux-cuda.c
@@ -143,10 +143,9 @@ static int cuda_init_component(int cidx)
     _cuda_vector.cmp_info.num_native_events = -1;
     _cuda_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx;
 
-    //_cuda_vector.cmp_info.initialized = 1;
     _cuda_vector.cmp_info.disabled = PAPI_EDELAY_INIT;
     sprintf(_cuda_vector.cmp_info.disabled_reason,
-        "Not initialized. Access component events to initialize it.");
+            "Not initialized. Access component events to initialize it.");
     return PAPI_EDELAY_INIT;
 }
 
@@ -172,7 +171,6 @@ static int cuda_init_private(void)
     _papi_hwi_lock(COMPONENT_LOCK);
     SUBDBG("ENTER\n");
     if (_cuda_vector.cmp_info.initialized) goto fn_exit;
-    SUBDBG("Proceeding\n");
 
     papi_errno = cuptid_init();
     if (papi_errno != PAPI_OK) {
@@ -203,12 +201,9 @@ static int cuda_init_private(void)
 
 static int check_n_initialize(void)
 {
-
-    //_papi_hwi_lock(COMPONENT_LOCK); 
     if (!_cuda_vector.cmp_info.initialized) {
         return cuda_init_private();
     }
-    //_papi_hwi_unlock(COMPONENT_LOCK);
     return _cuda_vector.cmp_info.disabled;
 }
 
@@ -221,9 +216,7 @@ static int cuda_ntv_enum_events(unsigned int *event_code, int modifier)
     }
    
     uint64_t code = *(uint64_t *) event_code;
-    //_papi_hwi_lock(COMPONENT_LOCK);
     papi_errno = cuptid_evt_enum(&code, modifier);
-    //_papi_hwi_unlock(COMPONENT_LOCK);
     *event_code = (unsigned int) code;
     
 fn_exit:
@@ -241,9 +234,7 @@ static int cuda_ntv_name_to_code(const char *name, unsigned int *event_code)
     }
    
     uint64_t code;
-    //_papi_hwi_lock(COMPONENT_LOCK);
     papi_errno = cuptid_evt_name_to_code(name, &code);
-    //_papi_hwi_unlock(COMPONENT_LOCK);
     *event_code = (unsigned int) code;
 
     fn_exit:
@@ -277,9 +268,7 @@ static int cuda_ntv_code_to_descr(unsigned int event_code, char *descr, int len)
         goto fn_fail;
     }
 
-    //_papi_hwi_lock(COMPONENT_LOCK);
     papi_errno = cuptid_evt_code_to_descr((uint64_t) event_code, descr, len);
-    //_papi_hwi_unlock(COMPONENT_LOCK);
 
 fn_exit:
     SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno));
@@ -371,9 +360,10 @@ static int cuda_update_control_state(hwd_control_state_t *ctl, NativeInfo_t *ntv
     if (papi_errno != PAPI_OK) {
         goto fn_exit;
     }
-    printf("ntv_count: %d\n", ntv_count); 
+
     /* needed to make sure multipass events are caught with proper error code (PAPI_EMULPASS)*/
     papi_errno = cuptid_ctx_create(cuda_ctl->info, &(cuda_ctl->cuptid_ctx), cuda_ctl->events_id, cuda_ctl->num_events);
+
 fn_exit:
     SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno));
     return papi_errno;
@@ -414,8 +404,6 @@ int update_native_events(cuda_control_t *ctl, NativeInfo_t *ntv_info,
         sorted_events[i].frontend_idx = i;
     }
 
-    //qsort(sorted_events, ntv_count, sizeof(struct event_map_item), compare);
-
     for (i = 0; i < ntv_count; ++i) {
         ctl->events_id[i] = sorted_events[i].event_id;
         ntv_info[sorted_events[i].frontend_idx].ni_position = i;
diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c
index ca4b953d6..492eb260f 100644
--- a/src/components/cuda/papi_cupti_common.c
+++ b/src/components/cuda/papi_cupti_common.c
@@ -619,8 +619,6 @@ int cuptic_ctxarr_update_current(cuptic_info_t info)
         return PAPI_EMISC;
     }
 
-    printf("gpu_id inside update_current is: %d\n", gpu_id);
-
     /* return cuda context bound to the calling CPU thread */
     cuda_err = cuCtxGetCurrentPtr(&pctx);
     if (cuda_err != cudaSuccess) {
@@ -638,7 +636,6 @@ int cuptic_ctxarr_update_current(cuptic_info_t info)
         }
         /* cuda context not found for calling CPU thread */
         else {
-            printf("We create a cuda context.\n");
             cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC);
             cudaCheckErrors(cuCtxGetCurrentPtr(&info[gpu_id].ctx), return PAPI_EMISC);
             LOGDBG("Using primary device context %p for device %d.\n", info[gpu_id].ctx, gpu_id);
@@ -657,13 +654,6 @@ int cuptic_ctxarr_update_current(cuptic_info_t info)
 int cuptic_ctxarr_get_ctx(cuptic_info_t info, int gpu_idx, CUcontext *ctx)
 {
     *ctx = info[gpu_idx].ctx;
-    printf("gpu_idx: %d\n", gpu_idx);
-    if (*ctx == NULL) {
-        printf("ctx is null.\n");
-        *ctx = info[0].ctx;
-        if (*ctx != NULL) printf("Gpu id 0 is not null.\n"); 
-    }
-     
     return PAPI_OK;
 }
 

From b3a87f402a4c59c3db04b9e9830640aba9974b53 Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Tue, 11 Feb 2025 03:06:36 +0000
Subject: [PATCH 09/16] Update the function cuptic_ctxarr_update_current to
 work with papi_command_line with various device IDs appended to :device=#.

---
 src/components/cuda/cupti_profiler.c    |  8 +++-
 src/components/cuda/papi_cupti_common.c | 60 ++++++++++++-------------
 src/components/cuda/papi_cupti_common.h |  2 +-
 3 files changed, 37 insertions(+), 33 deletions(-)

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
index b754b5c4a..363bbdea6 100644
--- a/src/components/cuda/cupti_profiler.c
+++ b/src/components/cuda/cupti_profiler.c
@@ -1528,8 +1528,14 @@ int cuptip_ctx_create(cuptic_info_t thr_info, cuptip_control_t *pstate, uint64_t
         state->gpu_ctl[gpu_id].gpu_id = gpu_id;
     }
 
+    event_info_t info;
+    papi_errno = evt_id_to_info(events_id[num_events - 1], &info);
+    if (papi_errno != PAPI_OK) {
+        return papi_errno;
+    } 
+
     /* register the user created cuda context for the current gpu if not already known */
-    papi_errno = cuptic_ctxarr_update_current(thr_info);
+    papi_errno = cuptic_ctxarr_update_current(thr_info, info.device);
     if (papi_errno != PAPI_OK) {
         goto fn_exit;
     }
diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c
index 492eb260f..71d001bdd 100644
--- a/src/components/cuda/papi_cupti_common.c
+++ b/src/components/cuda/papi_cupti_common.c
@@ -607,46 +607,44 @@ int cuptic_ctxarr_create(cuptic_info_t *pinfo)
   *    Struct that contains a Cuda context, that can be indexed into based
   *    on device id.
 */
-int cuptic_ctxarr_update_current(cuptic_info_t info)
+int cuptic_ctxarr_update_current(cuptic_info_t info, int evt_dev_id)
 {
     int gpu_id;
     CUcontext pctx;
     CUresult cuda_err;
+    CUdevice dev_id;
 
-    /* get device currently being used */
-    cuda_err = cudaGetDevicePtr(&gpu_id);
-    if (cuda_err != cudaSuccess) {
-        return PAPI_EMISC;
-    }
-
-    /* return cuda context bound to the calling CPU thread */
+    // See if a user created a CUDA context on the
+    // calling cpu thread.
     cuda_err = cuCtxGetCurrentPtr(&pctx);
-    if (cuda_err != cudaSuccess) {
-        return PAPI_EMISC;
-    }
-    /* check to see if Cuda context exists for device  */
-    if (info[gpu_id].ctx == NULL) {
-        /* cuda context found for the calling CPU thread */
-        if (pctx != NULL) {
-            LOGDBG("Registering device = %d with ctx = %p.\n", gpu_id, pctx);
-            /* store current context into struct */
-            cuda_err = cuCtxGetCurrentPtr(&info[gpu_id].ctx);
-            if (cuda_err != cudaSuccess)
+    if (cuda_err == CUDA_SUCCESS && pctx != NULL) {
+        // Get the device id associated with the user created CUDA context
+        cuda_err = cuCtxGetDevicePtr(&dev_id);
+        if (cuda_err != CUDA_SUCCESS) {
+            return PAPI_EMISC;
+        }
+
+        if (info[dev_id].ctx == NULL) {
+            // Store current user created CUDA context
+            cuda_err = cuCtxGetCurrentPtr(&info[dev_id].ctx);
+            if (cuda_err != CUDA_SUCCESS) {
                 return PAPI_EMISC;
+            }
         }
-        /* cuda context not found for calling CPU thread */
-        else {
-            cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC);
-            cudaCheckErrors(cuCtxGetCurrentPtr(&info[gpu_id].ctx), return PAPI_EMISC);
-            LOGDBG("Using primary device context %p for device %d.\n", info[gpu_id].ctx, gpu_id);
+        else if (info[dev_id].ctx != pctx) {
+            ERRDBG("Warning: cuda context for gpu %d has changed from %p to %p\n", gpu_id, info[gpu_id].ctx, pctx);
         }
-    }
-
-    /* if context exists then see if it has changed; if it has then keep the first
-       seen one, but show warning */
-    else if (info[gpu_id].ctx != pctx) {
-        ERRDBG("Warning: cuda context for gpu %d has changed from %p to %p\n", gpu_id, info[gpu_id].ctx, pctx);
-    }
+     }
+     // If a user did not create a CUDA context, then we will create one 
+     // for them. Note, that for machine with multiple devices, we need to
+     // call cudaSetDevice.
+     else {
+         cudaArtCheckErrors(cudaSetDevicePtr(evt_dev_id), return PAPI_EMISC);
+         cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC);
+
+         cudaCheckErrors(cuCtxGetCurrentPtr(&info[evt_dev_id].ctx), return PAPI_EMISC);
+         cudaCheckErrors(cuCtxPopCurrentPtr(&pctx), PAPI_EMISC);
+     }
 
     return PAPI_OK;
 }
diff --git a/src/components/cuda/papi_cupti_common.h b/src/components/cuda/papi_cupti_common.h
index 13a30828f..398d75267 100644
--- a/src/components/cuda/papi_cupti_common.h
+++ b/src/components/cuda/papi_cupti_common.h
@@ -65,7 +65,7 @@ int cuptic_shutdown(void);
 
 /* context management interfaces */
 int cuptic_ctxarr_create(cuptic_info_t *pinfo);
-int cuptic_ctxarr_update_current(cuptic_info_t info);
+int cuptic_ctxarr_update_current(cuptic_info_t info, int evt_dev_id);
 int cuptic_ctxarr_get_ctx(cuptic_info_t info, int gpu_idx, CUcontext *ctx);
 int cuptic_ctxarr_destroy(cuptic_info_t *pinfo);
 

From e355daab784df0546a5f5073e638ee9d35761a02 Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Wed, 12 Feb 2025 18:01:28 +0000
Subject: [PATCH 10/16] Update function to append default device qualifier.

---
 src/components/cuda/cupti_profiler.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
index 363bbdea6..c567a9ac6 100644
--- a/src/components/cuda/cupti_profiler.c
+++ b/src/components/cuda/cupti_profiler.c
@@ -2583,8 +2583,9 @@ static int evt_name_to_basename(const char *name, char *base, int len)
 }
 
 /** @class evt_name_to_device
-  * @brief Take a Cuda native event name with a device qualifer appended to 
-  *        it and collect the device number.
+  * @brief Return the device number for a user provided Cuda native event.
+  *        This can be done with a device qualifier present (:device=#) or
+  *        we internally find the first device the native event exists for.
   * @param *name
   *   Cuda native event name with a device qualifier appended.
   * @param *device
@@ -2593,11 +2594,27 @@ static int evt_name_to_basename(const char *name, char *base, int len)
 static int evt_name_to_device(const char *name, int *device)
 {
     char *p = strstr(name, ":device=");
+    // User did provide :device=# qualifier
     if (p) {
         *device = (int) strtol(p + strlen(":device="), NULL, 10);
     }
+    // User did not provide :device=# qualifier
     else {
-        *device = 0;
+        int i, htable_errno;
+        cuptiu_event_t *event;
+
+        htable_errno = htable_find(cuptiu_table_p->htable, name, (void **) &event);
+        if (htable_errno != HTABLE_SUCCESS) {
+            return PAPI_EINVAL;
+        }
+
+        // Search for the first device the event exists for.
+        for (i = 0; i < num_gpus; ++i) {
+            if (cuptiu_dev_check(event->device_map, i)) {
+                *device = i;
+                break;
+            }
+        }
     }
     return PAPI_OK;
 }

From 1bdd85bec6bbf149cbd1fb761e82b16a20c441fc Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Wed, 12 Feb 2025 19:01:35 +0000
Subject: [PATCH 11/16] Fix indexing in init_event_table to free allocated
 memory.

---
 src/components/cuda/cupti_profiler.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
index c567a9ac6..44a626681 100644
--- a/src/components/cuda/cupti_profiler.c
+++ b/src/components/cuda/cupti_profiler.c
@@ -2004,7 +2004,7 @@ int init_event_table(void)
         NVPW_MetricsContext_GetMetricNames_End_Params getMetricNameEndParams = {
             .structSize = NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE,
             .pPriv = NULL,
-            .pMetricsContext = cuptiu_table_p->avail_gpu_info[gpu_idx].pmetricsContextCreateParams->pMetricsContext,
+            .pMetricsContext = cuptiu_table_p->avail_gpu_info[table_idx].pmetricsContextCreateParams->pMetricsContext,
         };
         nvpwCheckErrors( NVPW_MetricsContext_GetMetricNames_EndPtr((NVPW_MetricsContext_GetMetricNames_End_Params *) &getMetricNameEndParams), goto fn_fail );
     }

From cb47152c8c2db8728bae1fc1b9c10de6f80cac1e Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Fri, 14 Feb 2025 13:46:46 +0000
Subject: [PATCH 12/16] Add guard against a user PopCurrent and additional
 check for snprintf.

---
 src/components/cuda/linux-cuda.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/components/cuda/linux-cuda.c b/src/components/cuda/linux-cuda.c
index 22c508fd0..4b1cf549c 100644
--- a/src/components/cuda/linux-cuda.c
+++ b/src/components/cuda/linux-cuda.c
@@ -170,14 +170,18 @@ static int cuda_init_private(void)
 
     _papi_hwi_lock(COMPONENT_LOCK);
     SUBDBG("ENTER\n");
-    if (_cuda_vector.cmp_info.initialized) goto fn_exit;
+
+    if (_cuda_vector.cmp_info.initialized) {
+        SUBDBG("Skipping cuda_init_private, as the Cuda event table has already been initialized.\n");
+        goto fn_exit;
+    }
 
     papi_errno = cuptid_init();
     if (papi_errno != PAPI_OK) {
         /* get and assign the string literal for the disabled reason */
         cuptid_disabled_reason_get(&disabled_reason);
         len = snprintf(_cuda_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", disabled_reason);
-        if (len > PAPI_MAX_STR_LEN) {
+        if (len < 0 || len >= PAPI_MAX_STR_LEN) {
             SUBDBG("The disabled reason has been truncated.\n");
         }
         goto fn_fail;

From fe2a676864924af1a62482f83acf456b237e7f4d Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Fri, 14 Feb 2025 13:47:30 +0000
Subject: [PATCH 13/16] Add file changes from papi_cupti_common.

---
 src/components/cuda/papi_cupti_common.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c
index 71d001bdd..3d74b0635 100644
--- a/src/components/cuda/papi_cupti_common.c
+++ b/src/components/cuda/papi_cupti_common.c
@@ -623,7 +623,6 @@ int cuptic_ctxarr_update_current(cuptic_info_t info, int evt_dev_id)
         if (cuda_err != CUDA_SUCCESS) {
             return PAPI_EMISC;
         }
-
         if (info[dev_id].ctx == NULL) {
             // Store current user created CUDA context
             cuda_err = cuCtxGetCurrentPtr(&info[dev_id].ctx);
@@ -639,11 +638,14 @@ int cuptic_ctxarr_update_current(cuptic_info_t info, int evt_dev_id)
      // for them. Note, that for machine with multiple devices, we need to
      // call cudaSetDevice.
      else {
-         cudaArtCheckErrors(cudaSetDevicePtr(evt_dev_id), return PAPI_EMISC);
-         cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC);
-
-         cudaCheckErrors(cuCtxGetCurrentPtr(&info[evt_dev_id].ctx), return PAPI_EMISC);
-         cudaCheckErrors(cuCtxPopCurrentPtr(&pctx), PAPI_EMISC);
+         // Guard against a user PopCurrent
+         if (info[evt_dev_id].ctx == NULL) {
+             cudaArtCheckErrors(cudaSetDevicePtr(evt_dev_id), return PAPI_EMISC);
+             cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC);
+
+             cudaCheckErrors(cuCtxGetCurrentPtr(&info[evt_dev_id].ctx), return PAPI_EMISC);
+             cudaCheckErrors(cuCtxPopCurrentPtr(&pctx), PAPI_EMISC);
+         }
      }
 
     return PAPI_OK;

From dbd3a5e579eeea33c54bcbfe163796bec63cbedc Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Fri, 14 Feb 2025 15:19:25 +0000
Subject: [PATCH 14/16] Syntax changes/clean up.

---
 src/components/cuda/cupti_profiler.c | 10 ++++------
 src/components/cuda/cupti_utils.h    |  1 -
 src/components/cuda/linux-cuda.c     |  1 -
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
index 44a626681..49dc061a1 100644
--- a/src/components/cuda/cupti_profiler.c
+++ b/src/components/cuda/cupti_profiler.c
@@ -1601,7 +1601,7 @@ int cuptip_ctx_start(cuptip_control_t state)
             ERRDBG("Profiling same gpu from multiple event sets not allowed.\n");
             return papi_errno;
         }
-        /* get the cuda context  */
+        /* get the cuda context */
         papi_errno = cuptic_ctxarr_get_ctx(state->info, gpu_id, &ctx);
         /* bind the specified CUDA context to the calling CPU thread */
         cudaCheckErrors( cuCtxSetCurrentPtr(ctx), goto fn_fail_misc );
@@ -1913,11 +1913,9 @@ int cuptip_shutdown(void)
 */
 int evt_id_create(event_info_t *info, uint64_t *event_id)
 {
-
     *event_id  = (uint64_t)(info->device   << DEVICE_SHIFT);
     *event_id |= (uint64_t)(info->flags    << QLMASK_SHIFT);
     *event_id |= (uint64_t)(info->nameid   << NAMEID_SHIFT);
-
     return PAPI_OK;
 }
 
@@ -1955,8 +1953,8 @@ int evt_id_to_info(uint64_t event_id, event_info_t *info)
 */
 int init_event_table(void) 
 {
-    int dev_id, found, table_idx = 0; 
-    int gpu_idx, i, listsubmetrics = 1, papi_errno = PAPI_OK;
+    int i, dev_id, found, table_idx = 0, papi_errno = PAPI_OK;
+    int listsubmetrics = 1
 
     /* instatiate struct to collect the total metric count and metric names;
        instantiated here to avoid scoping issues */
@@ -2012,6 +2010,7 @@ int init_event_table(void)
   fn_exit:
     return papi_errno;
   fn_fail:
+    papi_errno = PAPI_EMISC; 
     goto fn_exit;
 
 }
@@ -2488,7 +2487,6 @@ static int evt_code_to_name(uint64_t event_code, char *name, int len)
 */
 int cuptip_evt_code_to_info(uint64_t event_code, PAPI_event_info_t *info)
 {
-
     int papi_errno, i, gpu_id;
     char description[PAPI_HUGE_STR_LEN];
 
diff --git a/src/components/cuda/cupti_utils.h b/src/components/cuda/cupti_utils.h
index 17c186111..742bda402 100644
--- a/src/components/cuda/cupti_utils.h
+++ b/src/components/cuda/cupti_utils.h
@@ -14,7 +14,6 @@
 
 #include <stdint.h>
 
-
 typedef int64_t cuptiu_bitmap_t;
 typedef int (*cuptiu_dev_get_map_cb)(uint64_t event_id, int *dev_id);
 typedef NVPW_CUDA_MetricsContext_Create_Params MCCP_t;
diff --git a/src/components/cuda/linux-cuda.c b/src/components/cuda/linux-cuda.c
index 4b1cf549c..3142d4d26 100644
--- a/src/components/cuda/linux-cuda.c
+++ b/src/components/cuda/linux-cuda.c
@@ -320,7 +320,6 @@ static int cuda_shutdown_thread(hwd_context_t *ctx)
 static int cuda_init_control_state(hwd_control_state_t __attribute__((unused)) *ctl)
 {
     COMPDBG("Entering.\n");
-    //return PAPI_OK;
     return check_n_initialize();
 }
 

From d701c84dd6baebbf532effb1c3e5540a13de4840 Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Fri, 14 Feb 2025 15:45:38 +0000
Subject: [PATCH 15/16] Forgot a semicolon.

---
 src/components/cuda/cupti_profiler.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
index 49dc061a1..6dae222fb 100644
--- a/src/components/cuda/cupti_profiler.c
+++ b/src/components/cuda/cupti_profiler.c
@@ -1954,7 +1954,7 @@ int evt_id_to_info(uint64_t event_id, event_info_t *info)
 int init_event_table(void) 
 {
     int i, dev_id, found, table_idx = 0, papi_errno = PAPI_OK;
-    int listsubmetrics = 1
+    int listsubmetrics = 1;
 
     /* instatiate struct to collect the total metric count and metric names;
        instantiated here to avoid scoping issues */

From 72443c62b0e0c796a2e2c6dccd9b8932c269c2c2 Mon Sep 17 00:00:00 2001
From: Treece Burgess <tburgess@icl.utk.edu>
Date: Thu, 20 Feb 2025 19:47:56 +0000
Subject: [PATCH 16/16] Update Cuda context creation

---
 src/components/cuda/cupti_profiler.c    | 79 ++++++++++++-------------
 src/components/cuda/linux-cuda.c        |  1 -
 src/components/cuda/papi_cupti_common.c | 48 ++++++++-------
 3 files changed, 65 insertions(+), 63 deletions(-)

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
index 6dae222fb..73a094a47 100644
--- a/src/components/cuda/cupti_profiler.c
+++ b/src/components/cuda/cupti_profiler.c
@@ -107,7 +107,7 @@ static int initialize_perfworks_api(void);
 
 /* utility functions to init metrics and cuda native event table */
 static int init_all_metrics(void);
-static void init_main_htable(void);
+static int init_main_htable(void);
 static int init_event_table(void);
 static int shutdown_event_table(void);
 static void free_all_enumerated_metrics(void);
@@ -136,7 +136,7 @@ static int calculate_num_passes(struct NVPA_RawMetricsConfig *pRawMetricsConfig,
                                 NVPA_RawMetricRequest *rmr, int *num_pass);
 
 /* functions to set and get cuda native event info  or convert cuda native events  */
-static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, unsigned int evt_code, int evt_pos, int gpu_id);
+static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, int gpu_id);
 static int verify_events(uint64_t *events_id, int num_events, cuptip_control_t state);
 static int evt_id_to_info(uint64_t event_id, event_info_t *info);
 static int evt_id_create(event_info_t *info, uint64_t *event_id);
@@ -179,7 +179,6 @@ NVPA_Status ( *NVPW_Profiler_CounterData_GetRangeDescriptionsPtr ) (NVPW_Profile
 NVPA_Status ( *NVPW_MetricsContext_SetCounterDataPtr ) (NVPW_MetricsContext_SetCounterData_Params* params);
 NVPA_Status ( *NVPW_MetricsContext_EvaluateToGpuValuesPtr ) (NVPW_MetricsContext_EvaluateToGpuValues_Params* params);
 NVPA_Status ( *NVPW_RawMetricsConfig_GetNumPassesPtr ) (NVPW_RawMetricsConfig_GetNumPasses_Params* params);
-NVPA_Status ( *NVPW_RawMetricsConfig_GetNumPassesPtr_V2 ) (NVPW_RawMetricsConfig_GetNumPasses_V2_Params* params);
 NVPA_Status ( *NVPW_RawMetricsConfig_SetCounterAvailabilityPtr ) (NVPW_RawMetricsConfig_SetCounterAvailability_Params* params);
 NVPA_Status ( *NVPW_RawMetricsConfig_IsAddMetricsPossiblePtr ) (NVPW_RawMetricsConfig_IsAddMetricsPossible_Params* params);
 NVPA_Status ( *NVPW_MetricsContext_GetCounterNames_BeginPtr ) (NVPW_MetricsContext_GetCounterNames_Begin_Params* pParams);
@@ -1289,11 +1288,7 @@ static int find_same_chipname(int gpu_id)
 static int init_all_metrics(void)
 {
     int gpu_id, papi_errno = PAPI_OK;
-    cuptiu_table_p->avail_gpu_info = (gpu_record_t *) papi_calloc(num_gpus, sizeof(gpu_record_t));
-    if (cuptiu_table_p->avail_gpu_info == NULL) {
-        papi_errno = PAPI_ENOMEM;
-        goto fn_exit;
-    }
+
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         papi_errno = get_chip_name(gpu_id, cuptiu_table_p->avail_gpu_info[gpu_id].chip_name);
         if (papi_errno != PAPI_OK) {
@@ -1363,9 +1358,9 @@ static void free_all_enumerated_metrics(void)
 /** @class init_main_htable
  *  @brief Initialize the main htable used to collect metrics.
 */
-static void init_main_htable(void) 
+static int init_main_htable(void)
 {
-    int i, val = 1, base = 2;
+    int i, val = 1, base = 2, papi_errno = PAPI_OK;
 
     /* allocate (2 ^ NAMEID_WIDTH) metric names, this matches the 
        number of bits for the event encoding format */
@@ -1374,13 +1369,31 @@ static void init_main_htable(void)
     }    
    
     /* initialize struct */ 
-    cuptiu_table_p = papi_malloc(sizeof(cuptiu_event_table_t));
+    cuptiu_table_p = (cuptiu_event_table_t *) papi_malloc(sizeof(cuptiu_event_table_t));
+    if (cuptiu_table_p == NULL) {
+        goto fn_fail;
+    }
     cuptiu_table_p->capacity = val; 
-    cuptiu_table_p->count = 0; 
-    cuptiu_table_p->events = papi_calloc(val, sizeof(cuptiu_event_t)); 
-   
+    cuptiu_table_p->count = 0;
+
+    cuptiu_table_p->events = (cuptiu_event_t *) papi_calloc(val, sizeof(cuptiu_event_t));
+    if (cuptiu_table_p->events == NULL) {
+        goto fn_fail;
+    }
+
+    cuptiu_table_p->avail_gpu_info = (gpu_record_t *) papi_calloc(num_gpus, sizeof(gpu_record_t));
+    if (cuptiu_table_p->avail_gpu_info == NULL) {
+        goto fn_fail;
+    }
+
     /* initialize the main hash table for metric collection */ 
     htable_init(&cuptiu_table_p->htable);
+
+  fn_exit:
+    return papi_errno;
+  fn_fail:
+    papi_errno = PAPI_ENOMEM;
+    goto fn_exit;
 }
 
 /** @class cuptip_init
@@ -1417,7 +1430,10 @@ int cuptip_init(void)
         goto fn_fail;
     }
 
-    init_main_htable();    
+    papi_errno = init_main_htable();
+    if (papi_errno != PAPI_OK) {
+        goto fn_fail;
+    }
 
     papi_errno = init_all_metrics();
     if (papi_errno != PAPI_OK) {
@@ -1581,13 +1597,8 @@ int cuptip_ctx_start(cuptip_control_t state)
     /* create a context handle */
     CUcontext userCtx, ctx;
 
-    /* return the Cuda context bound to the calling CPU thread */
+    // return the Cuda context bound to the calling CPU thread
     cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc );
-    /* if no context is found, create a context */
-    if (userCtx == NULL) {
-        cudaArtCheckErrors( cudaFreePtr(NULL), goto fn_fail_misc );
-        cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc );
-    }
 
     /* enumerate through all of the unique gpus */
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
@@ -1658,10 +1669,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters)
     CUcontext userCtx = NULL, ctx = NULL;
 
     cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc );
-    if (userCtx == NULL) {
-        cudaArtCheckErrors( cudaFreePtr(NULL), goto fn_fail_misc );
-        cudaArtCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc );
-    }
+
     for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
         if (gpu_ctl->added_events->count == 0) {
@@ -1704,6 +1712,7 @@ int cuptip_ctx_read(cuptip_control_t state, long long **counters)
         }
 
         for (i = 0; i < gpu_ctl->added_events->count; i++) {
+            /* Look up the slot in counter_vals that the i-th added event's value is written to. */
             evt_pos = gpu_ctl->added_events->evt_pos[i];
             if (state->read_count == 0) {
                 counter_vals[evt_pos] = counts[i];
@@ -1780,7 +1789,7 @@ int cuptip_ctx_reset(cuptip_control_t state)
     int i;
 
     for (i = 0; i < state->read_count; i++) {
-        state->counters[i] = 1;
+        state->counters[i] = 0;
     }
 
     state->read_count = 0;
@@ -1802,13 +1811,7 @@ int cuptip_ctx_stop(cuptip_control_t state)
     cuptip_gpu_state_t *gpu_ctl;
     CUcontext userCtx = NULL, ctx = NULL;
 
-    
-
     cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc );
-    if (userCtx == NULL) {
-        cudaArtCheckErrors( cudaFreePtr(NULL), goto fn_fail_misc );
-        cudaCheckErrors( cuCtxGetCurrentPtr(&userCtx), goto fn_fail_misc );
-    }
 
     for (gpu_id=0; gpu_id < num_gpus; gpu_id++) {
         gpu_ctl = &(state->gpu_ctl[gpu_id]);
@@ -1990,7 +1993,7 @@ int init_event_table(void)
 
         /* loop through metrics to add to overall event table */
         for (i = 0; i < cuptiu_table_p->avail_gpu_info[table_idx].num_metrics; i++) {
-            papi_errno = get_ntv_events( cuptiu_table_p, cuptiu_table_p->avail_gpu_info[table_idx].metric_names[i], i, 0, dev_id);
+            papi_errno = get_ntv_events( cuptiu_table_p, cuptiu_table_p->avail_gpu_info[table_idx].metric_names[i], dev_id);
             if (papi_errno != PAPI_OK)
                 goto fn_exit;
         }
@@ -2016,22 +2019,16 @@ int init_event_table(void)
 }
 
 /** @class get_ntv_events
-  * @brief Add the event name, event code, and event position to the hash table.
+  * @brief Store Cuda native events and their corresponding device(s).
   *
   * @param *evt_table
   *   Structure containing member variables such as name, evt_code, evt_pos,
       and htable.
   * @param *evt_name
   *   Cuda native event name.
-  * @param evt_code
-  *   Event code which corresponds to the Cuda native event name.
-  * @param evt_pos
-  *   Position within the hash table. 
 */
-static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, unsigned int evt_code, int evt_pos, int gpu_id) 
+static int get_ntv_events(cuptiu_event_table_t *evt_table, const char *evt_name, int gpu_id) 
 {
-    int papi_errno;
-    char description[256];
     int *count = &evt_table->count;
     cuptiu_event_t *events = evt_table->events;
 
diff --git a/src/components/cuda/linux-cuda.c b/src/components/cuda/linux-cuda.c
index 3142d4d26..35d153808 100644
--- a/src/components/cuda/linux-cuda.c
+++ b/src/components/cuda/linux-cuda.c
@@ -515,7 +515,6 @@ static int cuda_reset(hwd_context_t __attribute__((unused)) *ctx, hwd_control_st
         return PAPI_EMISC;
     }
 
-    /* To-do: Understand how this connects to values, memory addresses are not the same. */
     papi_errno = cuptid_ctx_reset(cuda_ctl->cuptid_ctx);
      
     return papi_errno;
diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c
index 3d74b0635..e11b6e2f5 100644
--- a/src/components/cuda/papi_cupti_common.c
+++ b/src/components/cuda/papi_cupti_common.c
@@ -609,44 +609,50 @@ int cuptic_ctxarr_create(cuptic_info_t *pinfo)
 */
 int cuptic_ctxarr_update_current(cuptic_info_t info, int evt_dev_id)
 {
-    int gpu_id;
     CUcontext pctx;
     CUresult cuda_err;
     CUdevice dev_id;
 
-    // See if a user created a CUDA context on the
-    // calling cpu thread.
+    // If a Cuda context already exists, get it
     cuda_err = cuCtxGetCurrentPtr(&pctx);
-    if (cuda_err == CUDA_SUCCESS && pctx != NULL) {
+    if (cuda_err != CUDA_SUCCESS) {
+        return PAPI_EMISC;
+    }
+
+    // Cuda context was found 
+    if (pctx != NULL) { //info[dev_id].ctx != NULL does not work
+        SUBDBG("A Cuda context was found.\n");
         // Get the device id associated with the user created CUDA context
         cuda_err = cuCtxGetDevicePtr(&dev_id);
         if (cuda_err != CUDA_SUCCESS) {
             return PAPI_EMISC;
         }
-        if (info[dev_id].ctx == NULL) {
-            // Store current user created CUDA context
+
+        if(info[dev_id].ctx == NULL) {
+            // Store current user created Cuda context
             cuda_err = cuCtxGetCurrentPtr(&info[dev_id].ctx);
             if (cuda_err != CUDA_SUCCESS) {
                 return PAPI_EMISC;
             }
         }
         else if (info[dev_id].ctx != pctx) {
-            ERRDBG("Warning: cuda context for gpu %d has changed from %p to %p\n", gpu_id, info[gpu_id].ctx, pctx);
+            ERRDBG("Warning: cuda context for device %d has changed from %p to %p\n", dev_id, info[dev_id].ctx, pctx);
         }
-     }
-     // If a user did not create a CUDA context, then we will create one 
-     // for them. Note, that for machine with multiple devices, we need to
-     // call cudaSetDevice.
-     else {
-         // Guard against a user PopCurrent
-         if (info[evt_dev_id].ctx == NULL) {
-             cudaArtCheckErrors(cudaSetDevicePtr(evt_dev_id), return PAPI_EMISC);
-             cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC);
-
-             cudaCheckErrors(cuCtxGetCurrentPtr(&info[evt_dev_id].ctx), return PAPI_EMISC);
-             cudaCheckErrors(cuCtxPopCurrentPtr(&pctx), PAPI_EMISC);
-         }
-     }
+    }
+    // Cuda context was not found 
+    // Note, that for machines with multiple devices, we need to
+    // call cudaSetDevice. 
+    else {
+        // Guard against a user PopCurrent
+        if (info[evt_dev_id].ctx == NULL) {
+            SUBDBG("A Cuda context was not found. Therefore, one is created for: %d\n", evt_dev_id);
+            cudaArtCheckErrors(cudaSetDevicePtr(evt_dev_id), return PAPI_EMISC);
+            cudaArtCheckErrors(cudaFreePtr(NULL), return PAPI_EMISC);
+
+            cudaCheckErrors(cuCtxGetCurrentPtr(&info[evt_dev_id].ctx), return PAPI_EMISC);
+            cudaCheckErrors(cuCtxPopCurrentPtr(&pctx), return PAPI_EMISC);
+        }
+    }
 
     return PAPI_OK;
 }