CNugteren · fancyIX · Oct 24, 2023 · Oct 24, 2023 · Oct 28, 2023 · Oct 29, 2023
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
@@ -138,6 +138,25 @@ R"(
   #endif
 #endif
 
+#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_GCN == 1
+  #define SUBGROUP_SIZE 32              // Assumes subgroup size is always 32 on the targeted AMD GPUs (TODO confirm: the host enables this path for "gfx1*" device names)
+    #define NAVI_SHFL(s0, l)  /* overwrites s0 in place with the ds_bpermute_b32 result, using l as the address operand */ \
+    { \
+		__asm ( \
+		  "ds_bpermute_b32  %[dos0], %[ol0], %[os0]\n" \
+       "s_waitcnt lgkmcnt(0)\n" \
+		  : [dos0] "=&v" (s0) \
+		  : [ol0] "v" (l), \
+            [os0] "0" (s0)); \
+	 }
+   #define NAVI_LID() /* statement macro: returns from the enclosing function with the flattened local id modulo SUBGROUP_SIZE */ \
+     if (get_work_dim() == 2) { \
+      return (get_local_size(0) * get_local_id(1) + get_local_id(0)) % SUBGROUP_SIZE; \
+    } else { \
+      return (get_local_id(0)) % SUBGROUP_SIZE; \
+    }
+#endif
+
 #if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE
   #undef USE_SUBGROUP_SHUFFLING
   #define USE_SUBGROUP_SHUFFLING 0     // Disables subgroups in case the assumptions don't hold

diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
@@ -30,6 +30,8 @@ INLINE_FUNC int clblast_get_sub_group_local_id() {
   int ret;
   asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) );
   return ret;
+  #elif SUBGROUP_SHUFFLING_GCN == 1
+    NAVI_LID()
   #endif 
 }
 
@@ -49,6 +51,10 @@ INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
     asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src));
     #endif
   return ret;
+  #elif SUBGROUP_SHUFFLING_GCN == 1
+  realN ret = reg;
+  NAVI_SHFL(ret, ((src) << 2))
+  return ret;
   #endif
 }
 #endif

diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
@@ -78,6 +78,11 @@ std::shared_ptr<Program> CompileFromSource(
     }
   }
 
+  if (device.IsGPU() && device.IsAMD() && device.Name().find("gfx1") != std::string::npos) {
+    header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
+    header_string += "#define SUBGROUP_SHUFFLING_GCN 1\n";
+  }
+
   // For Qualcomm devices, specifying the OpenCL kernel attribute reqd_work_group_size reduces performance.
   // This option compiles without the workgroup size requirement and does not affect correctness.
   if (device.IsQualcomm()) {