JohannesGaessler committed on
Commit
e9c9d4b
·
1 Parent(s): 3436ba4

CUDA: fix q_nope_absorbed prec for DS 2 Lite f16 (llama/13137)

Browse files
ggml/include/ggml.h CHANGED
@@ -393,8 +393,8 @@ extern "C" {
393
 
394
  // precision
395
  enum ggml_prec {
396
- GGML_PREC_DEFAULT,
397
- GGML_PREC_F32,
398
  };
399
 
400
  // model file types
 
393
 
394
  // precision
395
  enum ggml_prec {
396
+ GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default
397
+ GGML_PREC_F32 = 10,
398
  };
399
 
400
  // model file types
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -1935,8 +1935,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
1935
  ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
1936
  } else if (!split && use_mul_mat_vec_q) {
1937
  ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
1938
- } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
1939
- && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
1940
  // general KQ + KQV multi-batch without FlashAttention
1941
  ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
1942
  } else if (use_mul_mat_vec) {
 
1935
  ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
1936
  } else if (!split && use_mul_mat_vec_q) {
1937
  ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
1938
+ } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
1939
+ dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
1940
  // general KQ + KQV multi-batch without FlashAttention
1941
  ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
1942
  } else if (use_mul_mat_vec) {