Commit af4dff1 · Parent(s): d324d0b
CUDA: fix MMV kernel being used for FP16 src1 (llama/10357)
ggml/src/ggml-cuda/ggml-cuda.cu (CHANGED)
@@ -1760,11 +1760,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split &&
+    if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+        // the custom F16 vector kernel can be used over batched cuBLAS GEMM
+        // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
         ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
         && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-        // KQ + KQV multi-batch without FlashAttention
+        // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
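For context, the following is a minimal, self-contained sketch (not the actual ggml code) of the kernel-selection decision this hunk touches. The struct, enum, and parameter names (shape, mul_mat_kernel, choose_kernel, mmv_max_rows) are hypothetical stand-ins for the values the real ggml_cuda_mul_mat reads (src0->ne[1], dst->ne[3], MMV_MAX_ROWS, use_mul_mat_vec, any_gpus_without_fp16_mma), and the second branch omits the F16/contiguity/batch checks present in the real code.

    // Sketch only: simplified stand-in for the dispatch in ggml_cuda_mul_mat.
    #include <cstdint>
    #include <cstdio>

    // Hypothetical, simplified tensor shape: ne[0..3] as in ggml tensors.
    struct shape { std::int64_t ne[4]; };

    enum class mul_mat_kernel { mmv_f16, batched_cublas, generic_split };

    // Mirrors the new guard: the custom F16 vector kernel is only chosen when the
    // MMV path applies at all (use_mul_mat_vec; per this fix, no longer for FP16 src1),
    // dst has no dim-3 batching, and either src0 is thin or no GPU has FP16 MMA.
    static mul_mat_kernel choose_kernel(const shape & src0, const shape & dst,
                                        bool split, bool use_mul_mat_vec,
                                        bool any_gpus_without_fp16_mma,
                                        std::int64_t mmv_max_rows /* stand-in for MMV_MAX_ROWS */) {
        if (!split && use_mul_mat_vec && dst.ne[3] == 1 &&
                (src0.ne[1] < mmv_max_rows || any_gpus_without_fp16_mma)) {
            return mul_mat_kernel::mmv_f16;        // custom F16 vector kernel
        }
        if (!split) {                              // real code also checks F16 src0, contiguity, batch dims
            return mul_mat_kernel::batched_cublas; // general KQ + KQV multi-batch path
        }
        return mul_mat_kernel::generic_split;      // fallback: ggml_cuda_op_mul_mat
    }

    int main() {
        // Thin src0 (e.g. KQV in attention): the vector kernel is picked even with tensor cores present.
        shape src0 = {{4096, 8, 1, 1}};
        shape dst  = {{8, 1, 1, 1}};
        bool picked_mmv = choose_kernel(src0, dst, false, true, false, 512) == mul_mat_kernel::mmv_f16;
        std::printf("thin src0 -> custom F16 vector kernel chosen: %d\n", picked_mmv ? 1 : 0);
        return 0;
    }

The substance of the commit is the first branch's guard: after the fix it is gated on use_mul_mat_vec, so the custom F16 vector kernel is no longer selected for FP16 src1, and it is preferred over batched cuBLAS GEMM only on GPUs without tensor cores or when src0 is thin, as noted in the added comments.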