HIP: enable mfma mmq on gfx908 and gfx90a for select datatypes and shapes (llama/14949)
- ggml/src/ggml-cuda/common.cuh +3 -4
- ggml/src/ggml-cuda/mmq.cu +20 -4
- ggml/src/ggml-cuda/mmq.cuh +2 -2
ggml/src/ggml-cuda/common.cuh
CHANGED
@@ -227,9 +227,9 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))

-#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#if defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
 #define AMD_MFMA_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)

 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE

@@ -293,10 +293,9 @@ static bool fp32_mma_hardware_available(const int cc) {
     return GGML_CUDA_CC_IS_CDNA(cc);
 }

-// AMD CDNA3 matrix cores.. Will add support for other CDNA generations later.
 static bool amd_mfma_available(const int cc) {
 #if !defined(GGML_HIP_NO_MMQ_MFMA)
-    return GGML_CUDA_CC_IS_CDNA3(cc);
+    return GGML_CUDA_CC_IS_CDNA(cc);
 #else
     return false;
 #endif //!defined(GGML_HIP_NO_MMQ_MFMA)
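The gate has two layers: a compile-time check (CDNA is defined when the kernels are compiled for a CDNA target, and GGML_HIP_NO_MMQ_MFMA is a build-time opt-out) and a run-time check on the device's compute capability. A minimal sketch of the pattern, assuming illustrative compute-capability values rather than ggml's real constants:

// Sketch only: the enum values and is_cdna() ranges are illustrative
// assumptions, not ggml's actual GGML_CUDA_CC_* definitions.
#include <cassert>

enum {
    CC_CDNA1 = 908,  // gfx908 (MI100)
    CC_CDNA2 = 910,  // gfx90a (MI200 series)
    CC_CDNA3 = 942,  // gfx942 (MI300 series)
    CC_RDNA1 = 1010, // first non-CDNA value above the CDNA range
};

// Run-time check: is this device any CDNA generation?
static bool is_cdna(const int cc) {
    return cc >= CC_CDNA1 && cc < CC_RDNA1;
}

// Mirrors amd_mfma_available(): compile-time opt-out, run-time arch check.
static bool mfma_available(const int cc) {
#if !defined(GGML_HIP_NO_MMQ_MFMA)
    return is_cdna(cc);
#else
    return false;
#endif
}

int main() {
    assert(mfma_available(CC_CDNA1));  // gfx908 now qualifies
    assert(mfma_available(CC_CDNA3));  // CDNA3 still qualifies
    assert(!mfma_available(CC_RDNA1)); // RDNA never did
    return 0;
}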
ggml/src/ggml-cuda/mmq.cu
CHANGED
@@ -109,8 +109,8 @@ void ggml_cuda_mul_mat_q(
     const int64_t s03 = src0->nb[3] / ts_src0;
     const int64_t s3 = dst->nb[3] / ts_dst;

-    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
-        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc));
+    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+        || GGML_CUDA_CC_IS_CDNA(cc);

     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +

@@ -252,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
     const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
-        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)))
+        || GGML_CUDA_CC_IS_CDNA(cc))
         && src1_ncols == ne11;
     const mmq_args args = {
         src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
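With this change, stream-k decomposition is used on any CDNA device, not just CDNA3. In ggml_cuda_op_mul_mat_q it is additionally gated on src1_ncols == ne11 because, as the comment notes, the stream-k fixup buffer would be raced by the parallel streams used when only a slice of the columns is processed. A hedged sketch of the predicate, with plain parameters standing in for the ggml helpers (GGML_CUDA_CC_IS_NVIDIA, GGML_CUDA_CC_IS_CDNA, ggml_cuda_highest_compiled_arch) and an illustrative CC_VOLTA constant:

#include <cstdint>

constexpr int CC_VOLTA = 700; // illustrative threshold, not ggml's constant

static bool use_stream_k(bool is_nvidia, int compiled_arch, bool is_cdna,
                         int64_t src1_ncols, int64_t ne11) {
    const bool arch_ok = (is_nvidia && compiled_arch >= CC_VOLTA) || is_cdna;
    // The fixup pass reuses one temporary buffer; partial-column launches run
    // on parallel streams and would race on it, hence the full-width check.
    return arch_ok && src1_ncols == ne11;
}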
@@ -306,7 +306,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
        return false;
    }

-    if (new_mma_available(cc) || amd_mfma_available(cc)) {
+    if (new_mma_available(cc)) {
        return true;
    }

@@ -322,5 +322,21 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
        return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
    }

+    if (amd_mfma_available(cc)) {
+        // As of ROCM 7.0 rocblas/tensile performs very poorly on CDNA3 and hipblaslt (via ROCBLAS_USE_HIPBLASLT)
+        // performs better but is currently suffering from a crash on this architecture.
+        // TODO: Revisit when hipblaslt is fixed on CDNA3
+        if (GGML_CUDA_CC_IS_CDNA3(cc)) {
+            return true;
+        }
+        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+            return true;
+        }
+        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
+            return true;
+        }
+        return false;
+    }
+
    return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
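The new branch in ggml_cuda_should_use_mmq prefers MMQ unconditionally on CDNA3 (where rocblas/tensile is slow and hipblaslt currently crashes), and on gfx908/gfx90a only for the batch sizes and quantization types the commit title calls "select datatypes and shapes". Restated as a standalone helper; the enum values and function names below are stand-ins for the ggml ones, copied thresholds aside:

#include <cstdint>

// Stand-ins for the ggml_type values referenced in the diff.
enum qtype { Q4_0, Q4_1, Q5_0, Q5_1, Q4_K, Q5_K, OTHER };

// Mirrors the heuristic added above, for MFMA-capable AMD GPUs only.
static bool mmq_preferred(bool is_cdna3, qtype type, int64_t ne11) {
    if (is_cdna3) {
        return true; // BLAS paths are currently poor or broken on CDNA3
    }
    // Small batches, or quant types where MFMA MMQ is used at any batch size:
    if (ne11 <= 128 || type == Q4_0 || type == Q4_1 || type == Q5_0 || type == Q5_1) {
        return true;
    }
    // Mid-sized batches only for these k-quants:
    if (ne11 <= 256 && (type == Q4_K || type == Q5_K)) {
        return true;
    }
    return false; // otherwise fall back to dequantize + BLAS
}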
ggml/src/ggml-cuda/mmq.cuh
CHANGED
@@ -3096,8 +3096,8 @@ static __global__ void mul_mat_q(
     }
     __syncthreads();

-    // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
-#if (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
+    // On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
+#if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
     {
         const int wt = blockIdx.z / nchannels_y;
         const int zt = blockIdx.z - wt*nchannels_y;