Spaces:
Sleeping
Sleeping
Commit
·
e9c9d4b
1
Parent(s):
3436ba4
CUDA: fix q_nope_absorbed prec for DS 2 Lite f16 (llama/13137)
Browse files
- ggml/include/ggml.h +2 -2
- ggml/src/ggml-cuda/ggml-cuda.cu +2 -2
ggml/include/ggml.h
CHANGED
|
@@ -393,8 +393,8 @@ extern "C" {
|
|
| 393 |
|
| 394 |
// precision
|
| 395 |
enum ggml_prec {
|
| 396 |
-
GGML_PREC_DEFAULT,
|
| 397 |
-
GGML_PREC_F32,
|
| 398 |
};
|
| 399 |
|
| 400 |
// model file types
|
|
|
|
| 393 |
|
| 394 |
// precision
|
| 395 |
enum ggml_prec {
|
| 396 |
+
GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default
|
| 397 |
+
GGML_PREC_F32 = 10,
|
| 398 |
};
|
| 399 |
|
| 400 |
// model file types
|
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
|
@@ -1935,8 +1935,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|
| 1935 |
ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
|
| 1936 |
} else if (!split && use_mul_mat_vec_q) {
|
| 1937 |
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
|
| 1938 |
-
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
|
| 1939 |
-
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
| 1940 |
// general KQ + KQV multi-batch without FlashAttention
|
| 1941 |
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
| 1942 |
} else if (use_mul_mat_vec) {
|
|
|
|
| 1935 |
ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
|
| 1936 |
} else if (!split && use_mul_mat_vec_q) {
|
| 1937 |
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
|
| 1938 |
+
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
|
| 1939 |
+
dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
| 1940 |
// general KQ + KQV multi-batch without FlashAttention
|
| 1941 |
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
| 1942 |
} else if (use_mul_mat_vec) {
|