Spaces:
Running
Running
Commit
·
1a9d2d3
1
Parent(s):
2d1e6e7
CUDA/HIP: optimize mmv paths taken for HIP devices (llama/14324)
Browse files
- ggml/src/ggml-cuda/common.cuh +5 -1
- ggml/src/ggml-cuda/mmv.cu +18 -0
ggml/src/ggml-cuda/common.cuh
CHANGED
|
@@ -263,7 +263,11 @@ static bool fp16_mma_hardware_available(const int cc) {
|
|
| 263 |
}
|
| 264 |
|
| 265 |
static bool bf16_mma_hardware_available(const int cc) {
|
| 266 |
-
return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
}
|
| 268 |
|
| 269 |
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
|
|
|
| 263 |
}
|
| 264 |
|
| 265 |
static bool bf16_mma_hardware_available(const int cc) {
|
| 266 |
+
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
static bool fp32_mma_hardware_available(const int cc) {
|
| 270 |
+
return GGML_CUDA_CC_IS_CDNA(cc);
|
| 271 |
}
|
| 272 |
|
| 273 |
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
ggml/src/ggml-cuda/mmv.cu
CHANGED
|
@@ -456,6 +456,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
|
|
| 456 |
return ne11 <= 4;
|
| 457 |
}
|
| 458 |
return ne11 <= 3;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
}
|
| 460 |
return ne11 <= 8;
|
| 461 |
case GGML_TYPE_F16:
|
|
@@ -468,6 +473,14 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
|
|
| 468 |
return src0_small && ne11 <= 3;
|
| 469 |
}
|
| 470 |
return ne11 <= 8;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
}
|
| 472 |
return ne11 <= 8;
|
| 473 |
case GGML_TYPE_BF16:
|
|
@@ -480,6 +493,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
|
|
| 480 |
return src0_small && ne11 <= 3;
|
| 481 |
}
|
| 482 |
return ne11 <= 8;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
}
|
| 484 |
return ne11 <= 8;
|
| 485 |
default:
|
|
|
|
| 456 |
return ne11 <= 4;
|
| 457 |
}
|
| 458 |
return ne11 <= 3;
|
| 459 |
+
} else if (GGML_CUDA_CC_IS_AMD(cc)) {
|
| 460 |
+
if (fp32_mma_hardware_available(cc)) {
|
| 461 |
+
return ne11 <= 3;
|
| 462 |
+
}
|
| 463 |
+
return ne11 <= 8;
|
| 464 |
}
|
| 465 |
return ne11 <= 8;
|
| 466 |
case GGML_TYPE_F16:
|
|
|
|
| 473 |
return src0_small && ne11 <= 3;
|
| 474 |
}
|
| 475 |
return ne11 <= 8;
|
| 476 |
+
} else if (GGML_CUDA_CC_IS_AMD(cc)) {
|
| 477 |
+
if (fp16_mma_hardware_available(cc)) {
|
| 478 |
+
if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
|
| 479 |
+
return ne11 <= 5;
|
| 480 |
+
}
|
| 481 |
+
return ne11 <= 2;
|
| 482 |
+
}
|
| 483 |
+
return ne11 <= 8;
|
| 484 |
}
|
| 485 |
return ne11 <= 8;
|
| 486 |
case GGML_TYPE_BF16:
|
|
|
|
| 493 |
return src0_small && ne11 <= 3;
|
| 494 |
}
|
| 495 |
return ne11 <= 8;
|
| 496 |
+
} else if (GGML_CUDA_CC_IS_AMD(cc)) {
|
| 497 |
+
if (bf16_mma_hardware_available(cc)) {
|
| 498 |
+
return ne11 <= 3;
|
| 499 |
+
}
|
| 500 |
+
return ne11 <= 8;
|
| 501 |
}
|
| 502 |
return ne11 <= 8;
|
| 503 |
default:
|