uvos JohannesGaessler committed on
Commit
1a9d2d3
·
1 Parent(s): 2d1e6e7

CUDA/HIP: optimize mmv paths taken for HIP devices (llama/14324)

Browse files
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -263,7 +263,11 @@ static bool fp16_mma_hardware_available(const int cc) {
263
  }
264
 
265
  static bool bf16_mma_hardware_available(const int cc) {
266
- return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE;
 
 
 
 
267
  }
268
 
269
  // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 
263
  }
264
 
265
  static bool bf16_mma_hardware_available(const int cc) {
266
+ return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
267
+ }
268
+
269
+ static bool fp32_mma_hardware_available(const int cc) {
270
+ return GGML_CUDA_CC_IS_CDNA(cc);
271
  }
272
 
273
  // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
ggml/src/ggml-cuda/mmv.cu CHANGED
@@ -456,6 +456,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
456
  return ne11 <= 4;
457
  }
458
  return ne11 <= 3;
 
 
 
 
 
459
  }
460
  return ne11 <= 8;
461
  case GGML_TYPE_F16:
@@ -468,6 +473,14 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
468
  return src0_small && ne11 <= 3;
469
  }
470
  return ne11 <= 8;
 
 
 
 
 
 
 
 
471
  }
472
  return ne11 <= 8;
473
  case GGML_TYPE_BF16:
@@ -480,6 +493,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
480
  return src0_small && ne11 <= 3;
481
  }
482
  return ne11 <= 8;
 
 
 
 
 
483
  }
484
  return ne11 <= 8;
485
  default:
 
456
  return ne11 <= 4;
457
  }
458
  return ne11 <= 3;
459
+ } else if (GGML_CUDA_CC_IS_AMD(cc)) {
460
+ if (fp32_mma_hardware_available(cc)) {
461
+ return ne11 <= 3;
462
+ }
463
+ return ne11 <= 8;
464
  }
465
  return ne11 <= 8;
466
  case GGML_TYPE_F16:
 
473
  return src0_small && ne11 <= 3;
474
  }
475
  return ne11 <= 8;
476
+ } else if (GGML_CUDA_CC_IS_AMD(cc)) {
477
+ if (fp16_mma_hardware_available(cc)) {
478
+ if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
479
+ return ne11 <= 5;
480
+ }
481
+ return ne11 <= 2;
482
+ }
483
+ return ne11 <= 8;
484
  }
485
  return ne11 <= 8;
486
  case GGML_TYPE_BF16:
 
493
  return src0_small && ne11 <= 3;
494
  }
495
  return ne11 <= 8;
496
+ } else if (GGML_CUDA_CC_IS_AMD(cc)) {
497
+ if (bf16_mma_hardware_available(cc)) {
498
+ return ne11 <= 3;
499
+ }
500
+ return ne11 <= 8;
501
  }
502
  return ne11 <= 8;
503
  default: