Commit 149f5a5 by uvos · 1 parent: 0c60f80

HIP: enable mfma mmq on gfx908 and gfx90a for select datatypes and shapes (llama/14949)

ggml/src/ggml-cuda/common.cuh CHANGED
@@ -227,9 +227,9 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
 
-#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#if defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
 #define AMD_MFMA_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
 
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
@@ -293,10 +293,9 @@ static bool fp32_mma_hardware_available(const int cc) {
     return GGML_CUDA_CC_IS_CDNA(cc);
 }
 
-// AMD CDNA3 matrix cores.. Will add support for other CDNA generations later.
 static bool amd_mfma_available(const int cc) {
 #if !defined(GGML_HIP_NO_MMQ_MFMA)
-    return GGML_CUDA_CC_IS_CDNA3(cc);
+    return GGML_CUDA_CC_IS_CDNA(cc);
 #else
     return false;
 #endif //!defined(GGML_HIP_NO_MMQ_MFMA)
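
Net effect of the common.cuh change: the MFMA MMQ path is now compiled in and reported as available on every CDNA generation, not just CDNA3, unless GGML_HIP_NO_MMQ_MFMA is defined. Below is a minimal standalone sketch of that gate, assuming the usual mapping of gfx908 to CDNA1 (MI100), gfx90a to CDNA2 (MI200) and gfx942 to CDNA3 (MI300); the real code keys off the CDNA/CDNA3 defines and the GGML_CUDA_CC_IS_CDNA* helpers shown above.

// Illustrative sketch only, not ggml code: models "MFMA MMQ is available on any
// CDNA part unless GGML_HIP_NO_MMQ_MFMA is set" (previously CDNA3 only).
#include <cstdio>
#include <string>

enum class CdnaGen { None, Cdna1, Cdna2, Cdna3 };

// Assumed gfx-name mapping for illustration; ggml derives this from the device's compute capability.
static CdnaGen classify(const std::string & gfx) {
    if (gfx == "gfx908") return CdnaGen::Cdna1; // MI100
    if (gfx == "gfx90a") return CdnaGen::Cdna2; // MI200 series
    if (gfx == "gfx942") return CdnaGen::Cdna3; // MI300 series
    return CdnaGen::None;
}

static bool mfma_mmq_available(CdnaGen gen, bool no_mmq_mfma) {
    if (no_mmq_mfma) {
        return false; // the GGML_HIP_NO_MMQ_MFMA opt-out still wins
    }
    return gen != CdnaGen::None; // old behavior: gen == CdnaGen::Cdna3 only
}

int main() {
    for (const char * gfx : {"gfx908", "gfx90a", "gfx942", "gfx1100"}) {
        std::printf("%-8s -> MFMA MMQ %s\n", gfx,
                    mfma_mmq_available(classify(gfx), /*no_mmq_mfma=*/false) ? "on" : "off");
    }
    return 0;
}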
ggml/src/ggml-cuda/mmq.cu CHANGED
@@ -109,8 +109,8 @@ void ggml_cuda_mul_mat_q(
     const int64_t s03 = src0->nb[3] / ts_src0;
     const int64_t s3  =  dst->nb[3] / ts_dst;
 
-    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
-        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)));
+    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+        || GGML_CUDA_CC_IS_CDNA(cc);
 
     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
@@ -252,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
     const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
-        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)))
+        || GGML_CUDA_CC_IS_CDNA(cc))
         && src1_ncols == ne11;
     const mmq_args args = {
         src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
@@ -306,7 +306,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return false;
     }
 
-    if (new_mma_available(cc) || amd_mfma_available(cc)) {
+    if (new_mma_available(cc)) {
         return true;
     }
 
@@ -322,5 +322,21 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }
 
+    if (amd_mfma_available(cc)) {
+        // As of ROCM 7.0 rocblas/tensile performs very poorly on CDNA3 and hipblaslt (via ROCBLAS_USE_HIPBLASLT)
+        // performs better but is currently suffering from a crash on this architecture.
+        // TODO: Revisit when hipblaslt is fixed on CDNA3
+        if (GGML_CUDA_CC_IS_CDNA3(cc)) {
+            return true;
+        }
+        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+            return true;
+        }
+        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
+            return true;
+        }
+        return false;
+    }
+
     return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
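
The new branch in ggml_cuda_should_use_mmq keeps MMQ on unconditionally for CDNA3 (because of the rocblas/hipblaslt issues noted in the comment) and, on older CDNA parts such as gfx908 and gfx90a, only for small batches or the quantization types where MMQ is the better choice. A hedged standalone restatement of just that CDNA branch follows, with a local stand-in enum for the ggml_type values involved.

// Illustrative restatement of the CDNA-specific selection added above; not the ggml API.
// QType is a local stand-in for the relevant ggml_type values.
#include <cstdint>

enum class QType { Q4_0, Q4_1, Q5_0, Q5_1, Q4_K, Q5_K, Q6_K, Q8_0 };

// is_cdna3 == true corresponds to gfx942-class devices; false covers gfx908/gfx90a.
static bool mmq_on_cdna(QType type, int64_t ne11, bool is_cdna3) {
    if (is_cdna3) {
        return true; // BLAS alternatives are currently problematic on CDNA3 (see the diff comment)
    }
    if (ne11 <= 128 || type == QType::Q4_0 || type == QType::Q4_1 ||
        type == QType::Q5_0 || type == QType::Q5_1) {
        return true;  // small batches, or types where MMQ is preferred at any batch size
    }
    if (ne11 <= 256 && (type == QType::Q4_K || type == QType::Q5_K)) {
        return true;  // mid-sized batches for these k-quants
    }
    return false;     // everything else falls back to the BLAS path
}

Under this reading, gfx90a with Q4_0 stays on MMQ at any batch size, while Q4_K at ne11 = 512 would take the BLAS path.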
ggml/src/ggml-cuda/mmq.cuh CHANGED
@@ -3096,8 +3096,8 @@ static __global__ void mul_mat_q(
     }
     __syncthreads();
 
-    // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
-#if (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
+    // On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
+#if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
     {
         const int wt = blockIdx.z / nchannels_y;
         const int zt = blockIdx.z - wt*nchannels_y;
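
For readers unfamiliar with the term, the comment above contrasts two ways of distributing the matrix-multiply work: conventional tiling gives each block one output tile and the full K loop for it, while stream-k splits the total K iterations evenly across a fixed number of blocks, so a tile can be shared between blocks and the partial sums need a later fixup pass (the temporary buffer mentioned in ggml_cuda_op_mul_mat_q). Below is a rough host-side sketch of the two decompositions, purely conceptual and not the mul_mat_q kernel.

// Conceptual sketch of conventional tiling vs. stream-k work assignment; illustration only.
#include <cstdio>

// Conventional tiling: block w owns output tile w and its whole K loop.
static void conventional(int n_tiles, int k_iters_per_tile) {
    for (int w = 0; w < n_tiles; ++w) {
        std::printf("block %d: tile %d, K iterations [0, %d)\n", w, w, k_iters_per_tile);
    }
}

// Stream-k: the flattened (tile, K) iteration space is split evenly over a fixed
// worker count, so one tile's K range may be shared by two blocks and the partial
// results have to be combined in a fixup step afterwards.
static void stream_k(int n_tiles, int k_iters_per_tile, int n_workers) {
    const long long total = 1LL * n_tiles * k_iters_per_tile;
    for (int w = 0; w < n_workers; ++w) {
        const long long beg = total * w / n_workers;
        const long long end = total * (w + 1) / n_workers;
        std::printf("block %d: global K iterations [%lld, %lld)\n", w, beg, end);
    }
}

int main() {
    conventional(4, 64);
    stream_k(4, 64, 3);
    return 0;
}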