deepsek committed
Commit 5422b31 · Parent: 851010b

HIP: Enable Matrix cores for MMQ Kernels, Enable stream-K for CDNA 3 (llama/14624)

This commit adds support for MFMA instructions to MMQ. The MFMA-enabled code path supports CDNA1/GFX908, CDNA2/GFX90a, and CDNA3/GFX942. For now the code path and stream-K are only enabled on CDNA3, as they fail to outperform BLAS in all cases on the other devices: BLAS is currently only consistently outperformed on CDNA3, due to issues in the AMD-provided BLAS libraries.
This commit also makes MMQ aware of different warp sizes. As a side effect, this improves the performance of all quant formats on GCN GPUs, except q4_0 and q4_1, which regress slightly.
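
As a side note (not part of the patch), the warp-size handling boils down to dividing the tile area by the wavefront/warp width, so one lane owns half as many elements on AMD's 64-wide wavefronts as on NVIDIA's 32-wide warps. A minimal, self-contained sketch of that rule, using an illustrative name (tile_sketch) rather than the real ggml_cuda_mma::tile:

// Sketch only: mirrors the "ne = I * J / <warp size>" logic added to mma.cuh below.
template <int I, int J, typename T>
struct tile_sketch {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
    static constexpr int warp_size = 64; // AMD GCN/CDNA wavefront
#else
    static constexpr int warp_size = 32; // NVIDIA warp
#endif
    static constexpr int ne = I * J / warp_size; // elements owned by one lane
    T x[ne] = {0};
};

// A 16x16 int tile holds 4 ints per lane on wave64 and 8 per lane on warp32.
static_assert(tile_sketch<16, 16, int>::ne == 4 || tile_sketch<16, 16, int>::ne == 8, "per-lane element count");
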

ggml/src/ggml-cuda/common.cuh CHANGED

@@ -56,7 +56,7 @@
 #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
 #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
 #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
-#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
 #define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
 #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300

@@ -72,8 +72,9 @@
 #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
 #define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
 #define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
-#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
-#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
+#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)

 // Moore Threads
 #define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000

@@ -226,6 +227,10 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))

+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+#define AMD_MFMA_AVAILABLE
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

@@ -288,6 +293,11 @@ static bool fp32_mma_hardware_available(const int cc) {
     return GGML_CUDA_CC_IS_CDNA(cc);
 }

+// AMD CDNA3 matrix cores. Will add support for other CDNA generations later.
+static bool amd_mfma_available(const int cc) {
+    return cc >= GGML_CUDA_CC_OFFSET_AMD && GGML_CUDA_CC_IS_CDNA3(cc);
+}
+
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 static bool new_mma_available(const int cc) {
     return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
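
Usage note (a sketch under assumptions, not code from this commit): the compile-time macro and the runtime helper are meant to be paired. Device code compiled for gfx942 gates its MFMA path on AMD_MFMA_AVAILABLE, while host code asks amd_mfma_available(cc) before choosing that kernel. Roughly, assuming common.cuh is included and with a hypothetical kernel name:

// Device side: the MFMA body only exists in the gfx942 compilation pass.
__global__ void mmq_kernel_sketch(/* ... */) {
#if defined(AMD_MFMA_AVAILABLE)
    // tile loads + MFMA-based multiply-accumulate (see mma.cuh below)
#else
    // dp4a / scalar fallback for all other targets
#endif
}

// Host side: cc already carries GGML_CUDA_CC_OFFSET_AMD for AMD devices.
static bool prefer_mfma_path(const int cc) {
    return amd_mfma_available(cc); // true only on CDNA3 as of this commit
}
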
ggml/src/ggml-cuda/mma.cuh CHANGED

@@ -12,7 +12,8 @@
 // The methods get_i and get_j can be used to get the physical 32 bit index of the lth element of a thread within a tile.
 // All matrix tiles have ne physical 32 bit elements per warp.
 //
-// As described in the documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
+// As described in the PTX documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
+// The API in this file also assumes that the pointers for load_generic are aligned to 16 bytes, unaligned pointers are considered undefined behavior.

 #include "common.cuh"

@@ -66,7 +67,44 @@ namespace ggml_cuda_mma {
     struct tile {
         static constexpr int I = I_;
         static constexpr int J = J_;
-        static constexpr int ne = I * J / WARP_SIZE;
+
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+        static constexpr int ne = I * J / 64;
+        T x[ne] = {0};
+
+        static __device__ __forceinline__ int get_i(const int l) {
+            if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+                return threadIdx.x % 16;
+            } else if constexpr (I == 16 && J == 8) {
+                return threadIdx.x % 16;
+            } else if constexpr (I == 32 && J == 4) {
+                return threadIdx.x % 32;
+            } else if constexpr (I == 16 && J == 16) {
+                return 4 * (threadIdx.x / 16) + l;
+            } else if constexpr (I == 32 && J == 32) {
+                return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4);
+            } else {
+                static_assert(I == -1 && J == -1, "template specialization not implemented");
+            }
+        }
+
+        static __device__ __forceinline__ int get_j(const int l) {
+            if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+                return (2 * ((threadIdx.x / 16) % 2) + l);
+            } else if constexpr (I == 16 && J == 8) {
+                return 2 * (threadIdx.x / 16) + l;
+            } else if constexpr (I == 32 && J == 4) {
+                return 2 * (threadIdx.x / 32) + l;
+            } else if constexpr (I == 16 && J == 16) {
+                return threadIdx.x % 16;
+            } else if constexpr (I == 32 && J == 32) {
+                return threadIdx.x % 32;
+            } else {
+                static_assert(I == -1 && J == -1, "template specialization not implemented");
+            }
+        }
+#else
+        static constexpr int ne = I * J / 32;
         T x[ne] = {0};

         static __device__ __forceinline__ int get_i(const int l) {

@@ -94,6 +132,7 @@ namespace ggml_cuda_mma {
                 static_assert(I == -1 && J == -1, "template specialization not implemented");
             }
         }
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     };

     template <int I_, int J_>

@@ -148,10 +187,23 @@ namespace ggml_cuda_mma {

     template <int I, int J, typename T>
     static __device__ __forceinline__ void load_generic(tile<I, J, T> & t, const T * __restrict__ xs0, const int stride) {
+#if defined(AMD_MFMA_AVAILABLE)
+        if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+#pragma unroll
+            for (int l = 0; l < t.ne; ++l) {
+                t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
+            }
+        } else {
+            int64_t * xi = (int64_t *) t.x;
+            const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
+            xi[0] = xs[0];
+        }
+#else
 #pragma unroll
         for (int l = 0; l < t.ne; ++l) {
             t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
         }
+#endif // defined(AMD_MFMA_AVAILABLE)
     }

     template <typename T>

@@ -186,7 +238,7 @@ namespace ggml_cuda_mma {
     template <typename T>
     static __device__ __forceinline__ void load_ldmatrix(
             tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
-#ifdef NEW_MMA_AVAILABLE
+#if defined(NEW_MMA_AVAILABLE)
         int * xi = (int * ) t.x;
         const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
         asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"

@@ -393,4 +445,60 @@ namespace ggml_cuda_mma {
         NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
     }
+
+    static __device__ __forceinline__ void mma(
+            tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
+#if defined(AMD_MFMA_AVAILABLE)
+        using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
+        int32x4_t * acc = (int32x4_t *) D.x;
+#if defined(CDNA3)
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
+                                                       ((int64_t *) B.x)[0],
+                                                       acc[0],
+                                                       0, 0, 0);
+#elif defined(CDNA2) || defined(CDNA)
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
+                                                      B.x[0],
+                                                      acc[0],
+                                                      0, 0, 0);
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1],
+                                                      B.x[1],
+                                                      acc[0],
+                                                      0, 0, 0);
+#endif // defined(CDNA3)
+#else
+        GGML_UNUSED(D);
+        GGML_UNUSED(A);
+        GGML_UNUSED(B);
+        NO_DEVICE_CODE;
+#endif // AMD_MFMA_AVAILABLE
+    }
+
+    static __device__ __forceinline__ void mma(
+            tile<32, 32, int> & D, const tile<32, 4, int> & A, const tile<32, 4, int> & B) {
+#if defined(AMD_MFMA_AVAILABLE)
+        using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
+        int32x16_t * acc = (int32x16_t *) D.x;
+#if defined(CDNA3)
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
+                                                       ((int64_t *) B.x)[0],
+                                                       acc[0],
+                                                       0, 0, 0);
+#elif defined(CDNA2) || defined(CDNA)
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
+                                                     B.x[0],
+                                                     acc[0],
+                                                     0, 0, 0);
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1],
+                                                     B.x[1],
+                                                     acc[0],
+                                                     0, 0, 0);
+#endif // defined(CDNA3)
+#else
+        GGML_UNUSED(D);
+        GGML_UNUSED(A);
+        GGML_UNUSED(B);
+        NO_DEVICE_CODE;
+#endif // AMD_MFMA_AVAILABLE
+    }
 }
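
To make the new int8 MFMA path concrete, here is a hedged, standalone sketch (not from this patch) of a single wavefront issuing the CDNA3 builtin used by the mma() overload above. Buffer and kernel names are illustrative; the per-lane packing of A, B and D has to follow the tile<> get_i/get_j mapping from the diff for the result to correspond to a plain row/column layout.

// Sketch assuming ROCm hipcc targeting gfx942 (CDNA3).
#include <hip/hip_runtime.h>

typedef int int32x4_t __attribute__((__vector_size__(4 * sizeof(int))));

// One 64-lane wavefront computes D(16x16, int32) += A(16x32, int8) * B(32x16, int8).
// Each lane passes 8 int8 values of A and of B packed into an int64_t and gets back
// 4 int32 accumulators; here the fragments are simply stored back per lane.
__global__ void mfma_i8_tile_sketch(const int64_t * a_frag, const int64_t * b_frag, int * d_frag) {
    const int lane = threadIdx.x; // 0..63
#if defined(__gfx942__)
    int32x4_t acc = {0, 0, 0, 0};
    acc = __builtin_amdgcn_mfma_i32_16x16x32_i8(a_frag[lane], b_frag[lane], acc, 0, 0, 0);
#pragma unroll
    for (int l = 0; l < 4; ++l) {
        d_frag[4 * lane + l] = acc[l];
    }
#else
    (void) a_frag; (void) b_frag; (void) d_frag; // no MFMA on this target
#endif
}

A single such instruction covers a full 16x16x32 int8 tile per wavefront, which is why MMQ arranges its quantized data into these fragment layouts via load_generic and the get_i/get_j mappings.
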
ggml/src/ggml-cuda/mmq.cu CHANGED

@@ -109,7 +109,8 @@ void ggml_cuda_mul_mat_q(
     const int64_t s03 = src0->nb[3] / ts_src0;
     const int64_t s3 = dst->nb[3] / ts_dst;

-    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
+    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)));

     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +

@@ -250,8 +251,9 @@ void ggml_cuda_op_mul_mat_q(
     // The stream-k decomposition is only faster for recent NVIDIA GPUs.
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
-        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
+    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)))
+        && src1_ncols == ne11;
     const mmq_args args = {
         src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
         ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,

@@ -304,7 +306,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return false;
     }

-    if (new_mma_available(cc)) {
+    if (new_mma_available(cc) || amd_mfma_available(cc)) {
         return true;
     }
 
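A hedged restatement of the new stream-K rule (helper name hypothetical, assumes common.cuh is included): stream-K is used on NVIDIA Volta and newer and, with this commit, on AMD CDNA3; in ggml_cuda_op_mul_mat_q it is additionally restricted to the case where one call covers all of src1, because the stream-K fixup buffer would otherwise be raced by parallel streams.

// Sketch only: mirrors the use_stream_k condition in mmq.cu after this commit.
static bool use_stream_k_sketch(const int cc, const int64_t src1_ncols, const int64_t ne11) {
    const bool nvidia_ok = GGML_CUDA_CC_IS_NVIDIA(cc) &&
                           ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA; // Volta or newer
    const bool amd_ok    = GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc);      // MI300 only for now
    // src1_ncols == ne11 avoids a race on the fixup buffer between parallel streams.
    return (nvidia_ok || amd_ok) && src1_ncols == ne11;
}
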
ggml/src/ggml-cuda/mmq.cuh CHANGED
The diff for this file is too large to render. See raw diff
 
ggml/src/ggml-cuda/vendors/hip.h CHANGED

@@ -160,7 +160,19 @@
 #endif

 #if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
-#define CDNA
+#define CDNA // For the entire family
+#endif
+
+#if defined(__gfx942__)
+#define CDNA3
+#endif
+
+#if defined(__gfx90a__)
+#define CDNA2
+#endif
+
+#if defined(__gfx908__)
+#define CDNA1
 #endif

 #if defined(__GFX12__)
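
For illustration (a sketch, not part of the patch, assumes the hip.h header above is in effect): these per-generation macros let device code pick the matching MFMA builtins at compile time, which is how the mma() overloads in mma.cuh branch between gfx942 and the older CDNA parts.

// Hypothetical helper showing how CDNA1/CDNA2/CDNA3 are meant to be consumed.
static __device__ __forceinline__ int cdna_generation_sketch() {
#if defined(CDNA3)
    return 3; // gfx942 (MI300): 16x16x32 / 32x32x16 int8 MFMA
#elif defined(CDNA2)
    return 2; // gfx90a (MI210/MI250): 16x16x16 / 32x32x8 int8 MFMA
#elif defined(CDNA1)
    return 1; // gfx908 (MI100): same int8 MFMA shapes as CDNA2
#else
    return 0; // not a CDNA target
#endif
}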