JohannesGaessler committed
Commit 6df9571 · 1 Parent(s): acfd94f

CUDA: fix Volta FlashAttention logic (llama/11615)
ggml/src/ggml-cuda/fattn-wmma-f16.cu CHANGED

@@ -561,7 +561,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
             ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
             break;
         // case 256:
-        //     ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
+        //     ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
         //     break;
         default:
             GGML_ABORT("fatal error");
ggml/src/ggml-cuda/fattn.cu CHANGED

@@ -235,7 +235,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
         return;
     }
 
-    if (!new_mma_available(cc)) {
+    if (!fp16_mma_available(cc)) {
         if (prec == GGML_PREC_DEFAULT) {
             if (Q->ne[1] <= 8) {
                 ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
@@ -265,6 +265,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     // The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
     if (cc == GGML_CUDA_CC_VOLTA) {
         ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
+        return;
     }
 
     ggml_cuda_flash_attn_ext_mma_f16(ctx, dst);
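
For context: before this change the Volta branch launched the WMMA kernel and then fell through into the MMA kernel (which needs Turing or newer), and the fallback check used new_mma_available() instead of fp16_mma_available(). The sketch below is a hypothetical, self-contained rendering of the corrected dispatch flow; the compute-capability constants and stub functions only mirror the names in the diff and are not the real ggml CUDA backend API.

// Minimal sketch (assumption-based, not the real ggml code) of the corrected
// FlashAttention dispatch: no tensor cores -> vector kernel, Volta -> WMMA
// kernel and return, Turing or newer -> MMA kernel.
#include <cstdio>

constexpr int GGML_CUDA_CC_VOLTA  = 700; // assumed value for compute capability 7.0
constexpr int GGML_CUDA_CC_TURING = 750; // assumed value for compute capability 7.5

// Assumed stand-in for the real availability check: FP16 WMMA needs Volta or newer.
bool fp16_mma_available(int cc) { return cc >= GGML_CUDA_CC_VOLTA; }

// Stub functions standing in for the real kernel launch wrappers.
void flash_attn_vec_f16 () { std::puts("vector kernel");            }
void flash_attn_wmma_f16() { std::puts("WMMA kernel (Volta path)"); }
void flash_attn_mma_f16 () { std::puts("MMA kernel (Turing+)");     }

void flash_attn_dispatch(int cc) {
    if (!fp16_mma_available(cc)) { // no FP16 tensor cores at all -> vector fallback
        flash_attn_vec_f16();
        return;
    }
    if (cc == GGML_CUDA_CC_VOLTA) { // tensor cores, but the MMA kernel needs Turing+
        flash_attn_wmma_f16();
        return;                     // the return added by this commit: do not fall through
    }
    flash_attn_mma_f16();
}

int main() {
    flash_attn_dispatch(610); // e.g. Pascal: vector kernel
    flash_attn_dispatch(700); // Volta: WMMA kernel only
    flash_attn_dispatch(890); // e.g. Ada: MMA kernel
}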