JohannesGaessler committed
Commit 849ff52 · Parent: ded0c68

CUDA: refactor mmq, dmmv, mmvq (llama/7716)

* CUDA: refactor mmq, dmmv, mmvq

* fix out-of-bounds write

* struct for qk, qr, qi

* fix cmake build

* mmq_type_traits
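
For readers skimming the diff: the "struct for qk, qr, qi" and "mmq_type_traits" bullets refer to collecting the per-quantization-type constants (block size qk, weights per data value qr, 32-bit ints per block qi) into traits structs that kernels read at compile time, instead of passing each constant as its own template parameter. A minimal, host-only sketch of that pattern follows; the names here are illustrative, the real definitions are the ggml_cuda_type_traits specializations added to ggml-cuda/common.cuh further down.

// Illustrative host-only sketch of the traits pattern (hypothetical names, not code from this commit).
#include <cstdio>

enum example_type { TYPE_Q4_0, TYPE_Q8_0 };

template <example_type type>
struct example_type_traits; // primary template intentionally left undefined

template <> struct example_type_traits<TYPE_Q4_0> {
    static constexpr int qk = 32; // quantized weights per block
    static constexpr int qr = 2;  // quantized weights packed into one data byte
    static constexpr int qi = 4;  // 32-bit ints of quantized data per block
};

template <> struct example_type_traits<TYPE_Q8_0> {
    static constexpr int qk = 32;
    static constexpr int qr = 1;
    static constexpr int qi = 8;
};

// A "kernel" templated on the type enum pulls all three constants from one place:
template <example_type type>
static void process_block() {
    constexpr int qk = example_type_traits<type>::qk;
    constexpr int qr = example_type_traits<type>::qr;
    printf("qk=%d qr=%d quantized bytes per block=%d\n", qk, qr, qk/qr);
}

int main() {
    process_block<TYPE_Q4_0>(); // qk=32 qr=2 quantized bytes per block=16
    process_block<TYPE_Q8_0>(); // qk=32 qr=1 quantized bytes per block=32
    return 0;
}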

This view is limited to 50 files because the commit contains too many changes.
Files changed (50):
  1. ggml-common.h +6 -0
  2. ggml-cuda.cu +9 -75
  3. ggml-cuda/common.cuh +156 -1
  4. ggml-cuda/dmmv.cu +21 -9
  5. ggml-cuda/mmq.cu +14 -1491
  6. ggml-cuda/mmq.cuh +1300 -0
  7. ggml-cuda/mmvq.cu +76 -61
  8. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  9. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  10. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  11. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  12. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  13. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  14. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  15. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  16. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  17. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  18. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  19. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  20. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  21. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  22. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  23. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  24. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  25. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  26. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  27. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  28. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  29. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  30. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  31. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  32. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  33. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  34. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  35. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  36. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  37. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  38. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  39. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  40. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  41. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  42. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  43. ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  44. ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  45. ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  46. ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  47. ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  48. ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  49. ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  50. ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
ggml-common.h CHANGED
@@ -123,12 +123,18 @@ typedef sycl::half2 ggml_half2;
 #define QI1_S (QK_K / (4*QR1_S))
 #define QR1_S 8
 
+#define QI1_M (QK_K / (4*QR1_M))
+#define QR1_M 8
+
 #define QI4_NL (QK4_NL / (4*QR4_NL))
 #define QR4_NL 2
 
 #define QI4_XS (QK_K / (4*QR4_XS))
 #define QR4_XS 8
 
+#define QI3_S (QK_K / (4*QR3_S))
+#define QR3_S 8
+
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
 #define QK4_0 32
ggml-cuda.cu CHANGED
@@ -633,88 +633,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
 
 // cuda split buffer
 
-static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
-    int64_t min_compute_capability = INT_MAX;
-    int64_t max_compute_capability = INT_MIN;
+static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
+    int64_t row_rounding = 0;
     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
-        if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
-            if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
-                min_compute_capability = ggml_cuda_info().devices[id].cc;
-            }
-            if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
-                max_compute_capability = ggml_cuda_info().devices[id].cc;
-            }
+        if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
+            continue;
         }
-    }
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
-        case GGML_TYPE_Q3_K:
-            return min_compute_capability < CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#else
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q6_K:
-            return 64;
-        default:
-            GGML_ASSERT(false);
+        const int cc = ggml_cuda_info().devices[id].cc;
+        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
     }
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    return row_rounding;
 }
 
 static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
     const int64_t nrows = ggml_nrows(tensor);
-    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+    const int64_t rounding = get_row_rounding(tensor_split);
 
     *row_low = id == 0 ? 0 : nrows*tensor_split[id];
     *row_low -= *row_low % rounding;
@@ -1499,7 +1433,7 @@ static void ggml_cuda_op_mul_mat(
         // for multi GPU, get the row boundaries from tensor split
         // and round to mul_mat_q tile sizes
         if (split) {
-            const int64_t rounding = get_row_rounding(src0->type, tensor_split);
+            const int64_t rounding = get_row_rounding(tensor_split);
 
             if (id != 0) {
                 dev[id].row_low = ne01*tensor_split[id];
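
A side note on the split-buffer change above: row boundaries for --split-mode row are now rounded to a single value derived from the MMQ tile height of the most capable device (get_mmq_y_host) rather than from a per-type table. A simplified, host-only illustration of how that rounding affects the row split (the 2-GPU split and the tile height of 128 are made-up numbers, not taken from the diff):

// Simplified illustration of rounding row boundaries to the MMQ tile height.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t nrows    = 4096;            // rows of the quantized matrix
    const float   split[2] = {0.0f, 0.6f};    // cumulative tensor split: GPU0 gets [0, 0.6), GPU1 gets [0.6, 1.0)
    const int64_t rounding = 128;             // e.g. get_mmq_y_host(cc, get_mmq_x_max_host(cc))

    int64_t row_low = (int64_t)(nrows*split[1]); // first row of GPU1 before rounding: 2457
    row_low -= row_low % rounding;               // round down to a tile boundary: 2432

    printf("GPU0 rows [0, %lld), GPU1 rows [%lld, %lld)\n",
           (long long) row_low, (long long) row_low, (long long) nrows);
    return 0;
}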
ggml-cuda/common.cuh CHANGED
@@ -160,7 +160,7 @@
 #endif
 
 #define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
-#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 64 // max batch size to use MMQ kernels when tensor cores are available
 
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
@@ -484,6 +484,161 @@ static __device__ __forceinline__ float get_alibi_slope(
     return powf(base, exph);
 }
 
+template <ggml_type type>
+struct ggml_cuda_type_traits;
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_F16> {
+    static constexpr int qk = 1;
+    static constexpr int qr = 1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
+    static constexpr int qk = QK4_0;
+    static constexpr int qr = QR4_0;
+    static constexpr int qi = QI4_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
+    static constexpr int qk = QK4_1;
+    static constexpr int qr = QR4_1;
+    static constexpr int qi = QI4_1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
+    static constexpr int qk = QK5_0;
+    static constexpr int qr = QR5_0;
+    static constexpr int qi = QI5_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
+    static constexpr int qk = QK5_1;
+    static constexpr int qr = QR5_1;
+    static constexpr int qi = QI5_1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
+    static constexpr int qk = QK8_0;
+    static constexpr int qr = QR8_0;
+    static constexpr int qi = QI8_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR2_K;
+    static constexpr int qi = QI2_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR3_K;
+    static constexpr int qi = QI3_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR4_K;
+    static constexpr int qi = QI4_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR5_K;
+    static constexpr int qi = QI5_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR6_K;
+    static constexpr int qi = QI6_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR2_XXS;
+    static constexpr int qi = QI2_XXS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR2_XS;
+    static constexpr int qi = QI2_XS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR2_S;
+    static constexpr int qi = QI2_S;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR3_XXS;
+    static constexpr int qi = QI3_XXS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR1_S;
+    static constexpr int qi = QI1_S;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR1_M;
+    static constexpr int qi = QI1_M;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
+    static constexpr int qk = QK4_NL;
+    static constexpr int qr = QR4_NL;
+    static constexpr int qi = QI4_NL;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR4_XS;
+    static constexpr int qi = QI4_XS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR3_S;
+    static constexpr int qi = QI3_S;
+};
+
+static int get_mmq_x_max_host(const int cc) {
+#ifdef CUDA_USE_TENSOR_CORES
+    return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
+#else
+    return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
+#endif // CUDA_USE_TENSOR_CORES
+}
+
+// Round rows to this value for --split-mode row:
+static int get_mmq_y_host(const int cc, const int mmq_x) {
+    return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
+}
+
 //////////////////////
 
 struct ggml_cuda_device_info {
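
The qi member of these traits follows the same relation used in ggml-common.h above, qi = qk / (4*qr), i.e. the number of 32-bit integers holding one block's quantized data. A tiny standalone check of that relation for a few types (constants written out by hand here so the snippet compiles without the real headers):

// qi = qk / (4*qr): 32-bit ints of quantized data per block (values copied from ggml-common.h).
static_assert(32  / (4*2) == 4, "Q4_0:  QI4_0 == QK4_0 / (4*QR4_0)");
static_assert(32  / (4*1) == 8, "Q8_0:  QI8_0 == QK8_0 / (4*QR8_0)");
static_assert(256 / (4*8) == 8, "IQ1_M: QI1_M == QK_K  / (4*QR1_M)");

int main() { return 0; }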
ggml-cuda/dmmv.cu CHANGED
@@ -422,10 +422,22 @@ static __device__ void convert_f16(const void * vx, const int64_t ib, const int
     v.y = x[ib + iqs + 1];
 }
 
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
+    return type == GGML_TYPE_Q4_0 ? dequantize_q4_0 :
+        type == GGML_TYPE_Q4_1 ? dequantize_q4_1 :
+        type == GGML_TYPE_Q5_0 ? dequantize_q5_0 :
+        type == GGML_TYPE_Q5_1 ? dequantize_q5_1 :
+        type == GGML_TYPE_Q8_0 ? dequantize_q8_0 :
+        type == GGML_TYPE_F16 ? convert_f16 :
+        nullptr;
+}
+
+template <ggml_type type>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
-    // qk = quantized weights per x block
-    // qr = number of quantized weights per data value in x block
+    constexpr int qk = ggml_cuda_type_traits<type>::qk; // quantized weights per x block
+    constexpr int qr = ggml_cuda_type_traits<type>::qr; // number of quantized weights per data value in x block
+    constexpr dequantize_kernel_t dequantize_kernel = get_dequantize_kernel(type);
+
     const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
@@ -493,7 +505,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
     // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
+    dequantize_mul_mat_vec<GGML_TYPE_Q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
@@ -502,7 +514,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
+    dequantize_mul_mat_vec<GGML_TYPE_Q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
@@ -511,7 +523,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
+    dequantize_mul_mat_vec<GGML_TYPE_Q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
@@ -520,7 +532,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
+    dequantize_mul_mat_vec<GGML_TYPE_Q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
@@ -529,7 +541,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
+    dequantize_mul_mat_vec<GGML_TYPE_Q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
@@ -580,7 +592,7 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<1, 1, convert_f16>
+    dequantize_mul_mat_vec<GGML_TYPE_F16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
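
With the dmmv kernel templated on ggml_type, the per-type launcher functions above differ only in the template argument, so host-side dispatch naturally collapses to one switch over the tensor type. A hedged sketch of that dispatch shape (plain C++ stand-ins; launch_dmmv and the enum are hypothetical, and the real code keeps the existing *_cuda launcher functions):

// Hypothetical dispatch sketch: one switch selects the compile-time instantiation.
#include <cstdio>

enum sketch_type { SKETCH_Q4_0, SKETCH_Q8_0, SKETCH_F16 };

template <sketch_type type>
static void launch_dmmv(int ncols, int nrows) {
    // In the real code this would launch dequantize_mul_mat_vec<type><<<...>>>(vx, y, dst, ncols, nrows).
    printf("launching instantiation %d for a %d x %d matrix\n", (int) type, nrows, ncols);
}

static void dmmv_dispatch(sketch_type type, int ncols, int nrows) {
    switch (type) {
        case SKETCH_Q4_0: launch_dmmv<SKETCH_Q4_0>(ncols, nrows); break;
        case SKETCH_Q8_0: launch_dmmv<SKETCH_Q8_0>(ncols, nrows); break;
        case SKETCH_F16:  launch_dmmv<SKETCH_F16>(ncols, nrows);  break;
    }
}

int main() {
    dmmv_dispatch(SKETCH_Q4_0, 4096, 4096);
    return 0;
}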
 
ggml-cuda/mmq.cu CHANGED
@@ -1,1450 +1,4 @@
1
  #include "mmq.cuh"
2
- #include "vecdotq.cuh"
3
-
4
- typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
5
- typedef void (*load_tiles_cuda_t)(
6
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
7
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
8
- typedef float (*vec_dot_q_mul_mat_cuda_t)(
9
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
10
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
11
- typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
12
- typedef void (mul_mat_q_t)(
13
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
14
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst);
15
-
16
- struct mmq_arch_config_t {
17
- int x;
18
- int y;
19
- int nwarps;
20
- };
21
-
22
- struct mmq_config_t {
23
- mmq_arch_config_t rdna2;
24
- mmq_arch_config_t rdna1;
25
- mmq_arch_config_t ampere;
26
- mmq_arch_config_t pascal;
27
- };
28
-
29
- constexpr mmq_config_t MMQ_CONFIG_Q4_0 = {
30
- // x y nwarps
31
- { 64, 128, 8},
32
- { 64, 64, 8},
33
- #ifdef CUDA_USE_TENSOR_CORES
34
- { 4, 32, 4},
35
- #else
36
- { 64, 128, 4},
37
- #endif // CUDA_USE_TENSOR_CORES
38
- { 64, 64, 8},
39
- };
40
- constexpr mmq_config_t MMQ_CONFIG_Q4_1 = {
41
- // x y nwarps
42
- { 64, 128, 8},
43
- { 64, 64, 8},
44
- #ifdef CUDA_USE_TENSOR_CORES
45
- { 4, 32, 4},
46
- #else
47
- { 64, 128, 4},
48
- #endif // CUDA_USE_TENSOR_CORES
49
- { 64, 64, 8},
50
- };
51
- constexpr mmq_config_t MMQ_CONFIG_Q5_0 = {
52
- // x y nwarps
53
- { 64, 128, 8},
54
- { 64, 64, 8},
55
- #ifdef CUDA_USE_TENSOR_CORES
56
- { 4, 32, 4},
57
- #else
58
- {128, 64, 4},
59
- #endif // CUDA_USE_TENSOR_CORES
60
- { 64, 64, 8},
61
- };
62
- constexpr mmq_config_t MMQ_CONFIG_Q5_1 = {
63
- // x y nwarps
64
- { 64, 128, 8},
65
- { 64, 64, 8},
66
- #ifdef CUDA_USE_TENSOR_CORES
67
- { 4, 32, 4},
68
- #else
69
- {128, 64, 4},
70
- #endif // CUDA_USE_TENSOR_CORES
71
- { 64, 64, 8},
72
- };
73
- constexpr mmq_config_t MMQ_CONFIG_Q8_0 = {
74
- // x y nwarps
75
- { 64, 128, 8},
76
- { 64, 64, 8},
77
- #ifdef CUDA_USE_TENSOR_CORES
78
- { 4, 32, 4},
79
- #else
80
- {128, 64, 4},
81
- #endif // CUDA_USE_TENSOR_CORES
82
- { 64, 64, 8},
83
- };
84
- constexpr mmq_config_t MMQ_CONFIG_Q2_K = {
85
- // x y nwarps
86
- { 64, 128, 8},
87
- {128, 32, 8},
88
- #ifdef CUDA_USE_TENSOR_CORES
89
- { 4, 32, 4},
90
- #else
91
- { 64, 128, 4},
92
- #endif // CUDA_USE_TENSOR_CORES
93
- { 64, 64, 8},
94
- };
95
- constexpr mmq_config_t MMQ_CONFIG_Q3_K = {
96
- // x y nwarps
97
- {128, 64, 8},
98
- { 32, 128, 8},
99
- #ifdef CUDA_USE_TENSOR_CORES
100
- { 4, 32, 4},
101
- #else
102
- {128, 128, 4},
103
- #endif // CUDA_USE_TENSOR_CORES
104
- { 64, 64, 8},
105
- };
106
- constexpr mmq_config_t MMQ_CONFIG_Q4_K = {
107
- // x y nwarps
108
- { 64, 128, 8},
109
- { 32, 64, 8},
110
- #ifdef CUDA_USE_TENSOR_CORES
111
- { 4, 32, 4},
112
- #else
113
- { 64, 128, 4},
114
- #endif // CUDA_USE_TENSOR_CORES
115
- { 64, 64, 8},
116
- };
117
- constexpr mmq_config_t MMQ_CONFIG_Q5_K = {
118
- // x y nwarps
119
- { 64, 128, 8},
120
- { 32, 64, 8},
121
- #ifdef CUDA_USE_TENSOR_CORES
122
- { 4, 32, 4},
123
- #else
124
- { 64, 128, 4},
125
- #endif // CUDA_USE_TENSOR_CORES
126
- { 64, 64, 8},
127
- };
128
- constexpr mmq_config_t MMQ_CONFIG_Q6_K = {
129
- // x y nwarps
130
- { 64, 128, 8},
131
- { 32, 64, 8},
132
- #ifdef CUDA_USE_TENSOR_CORES
133
- { 4, 32, 4},
134
- #else
135
- { 64, 64, 4},
136
- #endif // CUDA_USE_TENSOR_CORES
137
- { 64, 64, 8},
138
- };
139
-
140
- // ------------------------------------------------------------
141
-
142
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
143
- GGML_UNUSED(x_qh);
144
- GGML_UNUSED(x_sc);
145
-
146
- __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
147
- __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
148
-
149
- *x_ql = tile_x_qs;
150
- *x_dm = (half2 *) tile_x_d;
151
- }
152
-
153
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
154
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
155
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
156
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
157
- GGML_CUDA_ASSUME(i_offset >= 0);
158
- GGML_CUDA_ASSUME(i_offset < nwarps);
159
- GGML_CUDA_ASSUME(k >= 0);
160
- GGML_CUDA_ASSUME(k < WARP_SIZE);
161
-
162
- const int kbx = k / QI4_0;
163
- const int kqsx = k % QI4_0;
164
-
165
- const block_q4_0 * bx0 = (const block_q4_0 *) vx;
166
-
167
- float * x_dmf = (float *) x_dm;
168
-
169
- #pragma unroll
170
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
171
- int i = i0 + i_offset;
172
-
173
- if (need_check) {
174
- i = min(i, i_max);
175
- }
176
-
177
- const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
178
-
179
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
180
- // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
181
- }
182
-
183
- const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
184
- const int kbxd = k % blocks_per_tile_x_row;
185
-
186
- #pragma unroll
187
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
188
- int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
189
-
190
- if (need_check) {
191
- i = min(i, i_max);
192
- }
193
-
194
- const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
195
-
196
- x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
197
- }
198
- }
199
-
200
- static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
201
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
202
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
203
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
204
-
205
- const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
206
- const float * x_dmf = (const float *) x_dm;
207
-
208
- int u[2*VDR_Q4_0_Q8_1_MMQ];
209
-
210
- #pragma unroll
211
- for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
212
- u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
213
- u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
214
- }
215
-
216
- return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
217
- (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
218
- y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
219
- }
220
-
221
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
222
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
223
-
224
- __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
225
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
226
-
227
- *x_ql = tile_x_qs;
228
- *x_dm = tile_x_dm;
229
- }
230
-
231
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
232
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
233
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
234
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
235
-
236
- GGML_CUDA_ASSUME(i_offset >= 0);
237
- GGML_CUDA_ASSUME(i_offset < nwarps);
238
- GGML_CUDA_ASSUME(k >= 0);
239
- GGML_CUDA_ASSUME(k < WARP_SIZE);
240
-
241
- const int kbx = k / QI4_1;
242
- const int kqsx = k % QI4_1;
243
-
244
- const block_q4_1 * bx0 = (const block_q4_1 *) vx;
245
-
246
- #pragma unroll
247
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
248
- int i = i0 + i_offset;
249
-
250
- if (need_check) {
251
- i = min(i, i_max);
252
- }
253
-
254
- const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
255
-
256
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
257
- }
258
-
259
- const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
260
- const int kbxd = k % blocks_per_tile_x_row;
261
-
262
- #pragma unroll
263
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
264
- int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
265
-
266
- if (need_check) {
267
- i = min(i, i_max);
268
- }
269
-
270
- const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
271
-
272
- x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
273
- }
274
- }
275
-
276
- static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
277
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
278
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
279
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
280
-
281
- const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
282
-
283
- int u[2*VDR_Q4_1_Q8_1_MMQ];
284
-
285
- #pragma unroll
286
- for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
287
- u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
288
- u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
289
- }
290
-
291
- return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
292
- (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
293
- y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
294
- }
295
-
296
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
297
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
298
-
299
- __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
300
- __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
301
-
302
- *x_ql = tile_x_ql;
303
- *x_dm = (half2 *) tile_x_d;
304
- }
305
-
306
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
307
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
308
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
309
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
310
-
311
- GGML_CUDA_ASSUME(i_offset >= 0);
312
- GGML_CUDA_ASSUME(i_offset < nwarps);
313
- GGML_CUDA_ASSUME(k >= 0);
314
- GGML_CUDA_ASSUME(k < WARP_SIZE);
315
-
316
- const int kbx = k / QI5_0;
317
- const int kqsx = k % QI5_0;
318
-
319
- const block_q5_0 * bx0 = (const block_q5_0 *) vx;
320
-
321
- #pragma unroll
322
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
323
- int i = i0 + i_offset;
324
-
325
- if (need_check) {
326
- i = min(i, i_max);
327
- }
328
-
329
- const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
330
-
331
- const int ql = get_int_from_uint8(bxi->qs, kqsx);
332
- const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
333
-
334
- int qs0 = (ql >> 0) & 0x0F0F0F0F;
335
- qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
336
- qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
337
- qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
338
- qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
339
- qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
340
-
341
- x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
342
-
343
- int qs1 = (ql >> 4) & 0x0F0F0F0F;
344
- qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
345
- qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
346
- qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
347
- qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
348
- qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
349
-
350
- x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
351
- }
352
-
353
- const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
354
- const int kbxd = k % blocks_per_tile_x_row;
355
- float * x_dmf = (float *) x_dm;
356
-
357
- #pragma unroll
358
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
359
- int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
360
-
361
- if (need_check) {
362
- i = min(i, i_max);
363
- }
364
-
365
- const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
366
-
367
- x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
368
- }
369
- }
370
-
371
- static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
372
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
373
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
374
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
375
-
376
- const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
377
- const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
378
- const float * x_dmf = (const float *) x_dm;
379
- const float * y_df = (const float *) y_ds;
380
-
381
- int u[2*VDR_Q5_0_Q8_1_MMQ];
382
-
383
- #pragma unroll
384
- for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
385
- u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
386
- u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
387
- }
388
-
389
- return vec_dot_q8_0_q8_1_impl<float, QR5_0*VDR_Q5_0_Q8_1_MMQ>
390
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
391
- }
392
-
393
-
394
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
395
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
396
-
397
- __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
398
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
399
-
400
- *x_ql = tile_x_ql;
401
- *x_dm = tile_x_dm;
402
- }
403
-
404
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
405
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
406
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
407
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
408
-
409
- GGML_CUDA_ASSUME(i_offset >= 0);
410
- GGML_CUDA_ASSUME(i_offset < nwarps);
411
- GGML_CUDA_ASSUME(k >= 0);
412
- GGML_CUDA_ASSUME(k < WARP_SIZE);
413
-
414
- const int kbx = k / QI5_1;
415
- const int kqsx = k % QI5_1;
416
-
417
- const block_q5_1 * bx0 = (const block_q5_1 *) vx;
418
-
419
- #pragma unroll
420
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
421
- int i = i0 + i_offset;
422
-
423
- if (need_check) {
424
- i = min(i, i_max);
425
- }
426
-
427
- const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
428
-
429
- const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
430
- const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
431
-
432
- int qs0 = (ql >> 0) & 0x0F0F0F0F;
433
- qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
434
- qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
435
- qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
436
- qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
437
-
438
- x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
439
-
440
- int qs1 = (ql >> 4) & 0x0F0F0F0F;
441
- qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
442
- qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
443
- qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
444
- qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
445
-
446
- x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
447
- }
448
-
449
- const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
450
- const int kbxd = k % blocks_per_tile_x_row;
451
-
452
- #pragma unroll
453
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
454
- int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
455
-
456
- if (need_check) {
457
- i = min(i, i_max);
458
- }
459
-
460
- const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
461
-
462
- x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
463
- }
464
- }
465
-
466
- static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
467
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
468
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
469
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
470
-
471
- const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
472
- const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
473
-
474
- int u[2*VDR_Q5_1_Q8_1_MMQ];
475
-
476
- #pragma unroll
477
- for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
478
- u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
479
- u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
480
- }
481
-
482
- return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
483
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
484
- }
485
-
486
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
487
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
488
-
489
- __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
490
- __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
491
-
492
- *x_ql = tile_x_qs;
493
- *x_dm = (half2 *) tile_x_d;
494
- }
495
-
496
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
497
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
498
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
499
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
500
-
501
- GGML_CUDA_ASSUME(i_offset >= 0);
502
- GGML_CUDA_ASSUME(i_offset < nwarps);
503
- GGML_CUDA_ASSUME(k >= 0);
504
- GGML_CUDA_ASSUME(k < WARP_SIZE);
505
-
506
- const int kbx = k / QI8_0;
507
- const int kqsx = k % QI8_0;
508
- float * x_dmf = (float *) x_dm;
509
-
510
- const block_q8_0 * bx0 = (const block_q8_0 *) vx;
511
-
512
- #pragma unroll
513
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
514
- int i = i0 + i_offset;
515
-
516
- if (need_check) {
517
- i = min(i, i_max);
518
- }
519
-
520
- const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
521
-
522
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
523
- }
524
-
525
- const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
526
- const int kbxd = k % blocks_per_tile_x_row;
527
-
528
- #pragma unroll
529
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
530
- int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
531
-
532
- if (need_check) {
533
- i = min(i, i_max);
534
- }
535
-
536
- const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
537
-
538
- x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
539
- }
540
- }
541
-
542
- static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
543
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
544
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
545
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
546
-
547
- const float * x_dmf = (const float *) x_dm;
548
- const float * y_df = (const float *) y_ds;
549
-
550
- return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
551
- (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
552
- y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
553
- }
554
-
555
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
556
- GGML_UNUSED(x_qh);
557
-
558
- __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
559
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
560
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
561
-
562
- *x_ql = tile_x_ql;
563
- *x_dm = tile_x_dm;
564
- *x_sc = tile_x_sc;
565
- }
566
-
567
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
568
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
569
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
570
- GGML_UNUSED(x_qh);
571
-
572
- GGML_CUDA_ASSUME(i_offset >= 0);
573
- GGML_CUDA_ASSUME(i_offset < nwarps);
574
- GGML_CUDA_ASSUME(k >= 0);
575
- GGML_CUDA_ASSUME(k < WARP_SIZE);
576
-
577
- const int kbx = k / QI2_K;
578
- const int kqsx = k % QI2_K;
579
-
580
- const block_q2_K * bx0 = (const block_q2_K *) vx;
581
-
582
- #pragma unroll
583
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
584
- int i = i0 + i_offset;
585
-
586
- if (need_check) {
587
- i = min(i, i_max);
588
- }
589
-
590
- const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
591
-
592
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
593
- }
594
-
595
- const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
596
- const int kbxd = k % blocks_per_tile_x_row;
597
-
598
- #pragma unroll
599
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
600
- int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
601
-
602
- if (need_check) {
603
- i = min(i, i_max);
604
- }
605
-
606
- const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
607
-
608
- x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
609
- }
610
-
611
- #pragma unroll
612
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
613
- int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
614
-
615
- if (need_check) {
616
- i = min(i, i_max);
617
- }
618
-
619
- const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
620
-
621
- x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
622
- }
623
- }
624
-
625
- static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
626
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
627
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
628
- GGML_UNUSED(x_qh);
629
-
630
- const int kbx = k / QI2_K;
631
- const int ky = (k % QI2_K) * QR2_K;
632
- const float * y_df = (const float *) y_ds;
633
-
634
- int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
635
-
636
- const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
637
- const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
638
-
639
- #pragma unroll
640
- for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
641
- v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
642
- }
643
-
644
- const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
645
-
646
- const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
647
- return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
648
- }
649
-
650
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
651
-
652
- __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
653
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
654
- __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
655
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
656
-
657
- *x_ql = tile_x_ql;
658
- *x_dm = tile_x_dm;
659
- *x_qh = tile_x_qh;
660
- *x_sc = tile_x_sc;
661
- }
662
-
663
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
664
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
665
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
666
-
667
- GGML_CUDA_ASSUME(i_offset >= 0);
668
- GGML_CUDA_ASSUME(i_offset < nwarps);
669
- GGML_CUDA_ASSUME(k >= 0);
670
- GGML_CUDA_ASSUME(k < WARP_SIZE);
671
-
672
- const int kbx = k / QI3_K;
673
- const int kqsx = k % QI3_K;
674
-
675
- const block_q3_K * bx0 = (const block_q3_K *) vx;
676
-
677
- #pragma unroll
678
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
679
- int i = i0 + i_offset;
680
-
681
- if (need_check) {
682
- i = min(i, i_max);
683
- }
684
-
685
- const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
686
-
687
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
688
- }
689
-
690
- const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
691
- const int kbxd = k % blocks_per_tile_x_row;
692
- float * x_dmf = (float *) x_dm;
693
-
694
- #pragma unroll
695
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
696
- int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
697
-
698
- if (need_check) {
699
- i = min(i, i_max);
700
- }
701
-
702
- const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
703
-
704
- x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
705
- }
706
-
707
- #pragma unroll
708
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
709
- int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
710
-
711
- if (need_check) {
712
- i = min(i, i_max);
713
- }
714
-
715
- const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
716
-
717
- // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
718
- x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
719
- }
720
-
721
- #pragma unroll
722
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
723
- int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
724
-
725
- if (need_check) {
726
- i = min(i, i_max);
727
- }
728
-
729
- const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
730
-
731
- const int ksc = k % (QI3_K/4);
732
-
733
- const int ksc_low = ksc % (QI3_K/8);
734
- const int shift_low = 4 * (ksc / (QI3_K/8));
735
- const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
736
-
737
- const int ksc_high = QI3_K/8;
738
- const int shift_high = 2 * ksc;
739
- const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
740
-
741
- const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
742
-
743
- x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
744
- }
745
- }
746
-
747
- static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
748
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
749
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
750
-
751
- const int kbx = k / QI3_K;
752
- const int ky = (k % QI3_K) * QR3_K;
753
- const float * x_dmf = (const float *) x_dm;
754
- const float * y_df = (const float *) y_ds;
755
-
756
- const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
757
-
758
- int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
759
-
760
- #pragma unroll
761
- for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
762
- const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
763
- const int shift = 2 * ((ky % 32) / 8);
764
- const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
765
-
766
- const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
767
- const int vlh = (vh << 2) & 0x04040404;
768
-
769
- v[l] = __vsubss4(vll, vlh);
770
- }
771
-
772
- const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
773
- return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
774
- }
775
-
776
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
777
- GGML_UNUSED(x_qh);
778
-
779
- __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
780
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
781
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
782
-
783
- *x_ql = tile_x_ql;
784
- *x_dm = tile_x_dm;
785
- *x_sc = tile_x_sc;
786
- }
787
-
788
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
789
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
790
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
791
- GGML_UNUSED(x_qh);
792
-
793
- GGML_CUDA_ASSUME(i_offset >= 0);
794
- GGML_CUDA_ASSUME(i_offset < nwarps);
795
- GGML_CUDA_ASSUME(k >= 0);
796
- GGML_CUDA_ASSUME(k < WARP_SIZE);
797
-
798
- const int kbx = k / QI4_K; // == 0 if QK_K == 256
799
- const int kqsx = k % QI4_K; // == k if QK_K == 256
800
-
801
- const block_q4_K * bx0 = (const block_q4_K *) vx;
802
-
803
- #pragma unroll
804
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
805
- int i = i0 + i_offset;
806
-
807
- if (need_check) {
808
- i = min(i, i_max);
809
- }
810
-
811
- const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
812
-
813
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
814
- }
815
-
816
- const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
817
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
818
-
819
- #pragma unroll
820
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
821
- int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
822
-
823
- if (need_check) {
824
- i = min(i, i_max);
825
- }
826
-
827
- const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
828
-
829
- x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
830
- }
831
-
832
- #pragma unroll
833
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
834
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
835
-
836
- if (need_check) {
837
- i = min(i, i_max);
838
- }
839
-
840
- const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
841
-
842
- const int * scales = (const int *) bxi->scales;
843
-
844
- const int ksc = k % (WARP_SIZE/8);
845
-
846
- // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
847
- int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
848
- scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
849
-
850
- x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
851
- }
852
- }
853
-
854
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
855
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
856
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
857
- GGML_UNUSED(x_qh);
858
-
859
- const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
860
-
861
- const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
862
- return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
863
- x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
864
- }
865
-
866
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
867
- GGML_UNUSED(x_qh);
868
-
869
- __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
870
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
871
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
872
-
873
- *x_ql = tile_x_ql;
874
- *x_dm = tile_x_dm;
875
- *x_sc = tile_x_sc;
876
- }
877
-
878
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
879
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
880
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
881
- GGML_UNUSED(x_qh);
882
-
883
- GGML_CUDA_ASSUME(i_offset >= 0);
884
- GGML_CUDA_ASSUME(i_offset < nwarps);
885
- GGML_CUDA_ASSUME(k >= 0);
886
- GGML_CUDA_ASSUME(k < WARP_SIZE);
887
-
888
- const int kbx = k / QI5_K; // == 0 if QK_K == 256
889
- const int kqsx = k % QI5_K; // == k if QK_K == 256
890
-
891
- const block_q5_K * bx0 = (const block_q5_K *) vx;
892
-
893
- #pragma unroll
894
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
895
- int i = i0 + i_offset;
896
-
897
- if (need_check) {
898
- i = min(i, i_max);
899
- }
900
-
901
- const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
902
- const int ky = QR5_K*kqsx;
903
-
904
- const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
905
- const int ql0 = (ql >> 0) & 0x0F0F0F0F;
906
- const int ql1 = (ql >> 4) & 0x0F0F0F0F;
907
-
908
- const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
909
- const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
910
- const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
911
-
912
- const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
913
- const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
914
-
915
- x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
916
- x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
917
- }
918
-
919
- const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
920
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
921
-
922
- #pragma unroll
923
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
924
- int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
925
-
926
- if (need_check) {
927
- i = min(i, i_max);
928
- }
929
-
930
- const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
931
-
932
- x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
933
- }
934
-
935
- #pragma unroll
936
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
937
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
938
-
939
- if (need_check) {
940
- i = min(i, i_max);
941
- }
942
-
943
- const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
944
-
945
- const int * scales = (const int *) bxi->scales;
946
-
947
- const int ksc = k % (WARP_SIZE/8);
948
-
949
- // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
950
- int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
951
- scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
952
-
953
- x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
954
- }
955
- }
956
-
957
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
958
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
959
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
960
- GGML_UNUSED(x_qh);
961
-
962
- const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
963
-
964
- const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
965
- const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
966
- return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
967
- x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
968
- }
969
-
970
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
971
- GGML_UNUSED(x_qh);
972
-
973
- __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
974
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
975
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
976
-
977
- *x_ql = tile_x_ql;
978
- *x_dm = tile_x_dm;
979
- *x_sc = tile_x_sc;
980
- }
981
-
982
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
983
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
984
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
985
- GGML_UNUSED(x_qh);
986
-
987
- GGML_CUDA_ASSUME(i_offset >= 0);
988
- GGML_CUDA_ASSUME(i_offset < nwarps);
989
- GGML_CUDA_ASSUME(k >= 0);
990
- GGML_CUDA_ASSUME(k < WARP_SIZE);
991
-
992
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
993
- const int kqsx = k % QI6_K; // == k if QK_K == 256
994
-
995
- const block_q6_K * bx0 = (const block_q6_K *) vx;
996
-
997
- #pragma unroll
998
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
999
- int i = i0 + i_offset;
1000
-
1001
- if (need_check) {
1002
- i = min(i, i_max);
1003
- }
1004
-
1005
- const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
1006
- const int ky = QR6_K*kqsx;
1007
-
1008
- const int ql = get_int_from_uint8(bxi->ql, kqsx);
1009
- const int ql0 = (ql >> 0) & 0x0F0F0F0F;
1010
- const int ql1 = (ql >> 4) & 0x0F0F0F0F;
1011
-
1012
- const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
1013
- const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
1014
- const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
1015
-
1016
- const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
1017
- const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
1018
-
1019
- x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
1020
- x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
1021
- }
1022
-
1023
- const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
1024
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
1025
- float * x_dmf = (float *) x_dm;
1026
-
1027
- #pragma unroll
1028
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
1029
- int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
1030
-
1031
- if (need_check) {
1032
- i = min(i, i_max);
1033
- }
1034
-
1035
- const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
1036
-
1037
- x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
1038
- }
1039
-
1040
- #pragma unroll
1041
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
1042
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
1043
-
1044
- if (need_check) {
1045
- i = min(i, i_max);
1046
- }
1047
-
1048
- const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
1049
-
1050
- x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
1051
- }
1052
- }
1053
-
1054
- static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
1055
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1056
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1057
- GGML_UNUSED(x_qh);
1058
-
1059
- const float * x_dmf = (const float *) x_dm;
1060
- const float * y_df = (const float *) y_ds;
1061
-
1062
- const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
1063
-
1064
- const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
1065
- const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
1066
- return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
1067
- }
1068
-
1069
- template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
1070
- allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
1071
- static __device__ __forceinline__ void mul_mat_q(
1072
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1073
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1074
-
1075
- const block_q_t * x = (const block_q_t *) vx;
1076
- const block_q8_1 * y = (const block_q8_1 *) vy;
1077
-
1078
- const int blocks_per_row_x = ncols_x / qk;
1079
- const int blocks_per_col_y = nrows_y / QK8_1;
1080
- const int blocks_per_warp = WARP_SIZE / qi;
1081
-
1082
- const int & ncols_dst = ncols_y;
1083
-
1084
- const int row_dst_0 = blockIdx.x*mmq_y;
1085
- const int & row_x_0 = row_dst_0;
1086
-
1087
- const int col_dst_0 = blockIdx.y*mmq_x;
1088
- const int & col_y_0 = col_dst_0;
1089
-
1090
- int * tile_x_ql = nullptr;
1091
- half2 * tile_x_dm = nullptr;
1092
- int * tile_x_qh = nullptr;
1093
- int * tile_x_sc = nullptr;
1094
-
1095
- allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
1096
-
1097
- __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
1098
- __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
1099
-
1100
- float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
1101
-
1102
- for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
1103
-
1104
- load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
1105
- threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
1106
-
1107
- #pragma unroll
1108
- for (int ir = 0; ir < qr; ++ir) {
1109
- const int kqs = ir*WARP_SIZE + threadIdx.x;
1110
- const int kbxd = kqs / QI8_1;
1111
-
1112
- #pragma unroll
1113
- for (int i = 0; i < mmq_x; i += nwarps) {
1114
- const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
1115
-
1116
- const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
1117
-
1118
- const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
1119
- tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
1120
- }
1121
-
1122
- #pragma unroll
1123
- for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
1124
- const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
1125
- const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
1126
- const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
1127
-
1128
- // if the sum is not needed it's faster to transform the scale to f32 ahead of time
1129
- const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
1130
- half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
1131
- if (need_sum) {
1132
- *dsi_dst = *dsi_src;
1133
- } else {
1134
- float * dfi_dst = (float *) dsi_dst;
1135
- *dfi_dst = __low2float(*dsi_src);
1136
- }
1137
- }
1138
-
1139
- __syncthreads();
1140
-
1141
- // #pragma unroll // unrolling this loop causes too much register pressure
1142
- for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
1143
- #pragma unroll
1144
- for (int j = 0; j < mmq_x; j += nwarps) {
1145
- #pragma unroll
1146
- for (int i = 0; i < mmq_y; i += WARP_SIZE) {
1147
- sum[i/WARP_SIZE][j/nwarps] += vec_dot(
1148
- tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
1149
- threadIdx.x + i, threadIdx.y + j, k);
1150
- }
1151
- }
1152
- }
1153
-
1154
- __syncthreads();
1155
- }
1156
- }
1157
-
1158
- #pragma unroll
1159
- for (int j = 0; j < mmq_x; j += nwarps) {
1160
- const int col_dst = col_dst_0 + j + threadIdx.y;
1161
-
1162
- if (col_dst >= ncols_dst) {
1163
- return;
1164
- }
1165
-
1166
- #pragma unroll
1167
- for (int i = 0; i < mmq_y; i += WARP_SIZE) {
1168
- const int row_dst = row_dst_0 + threadIdx.x + i;
1169
-
1170
- if (row_dst >= nrows_dst) {
1171
- continue;
1172
- }
1173
-
1174
- dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
1175
- }
1176
- }
1177
- }
1178
-
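The removed mul_mat_q kernel above accumulates one partial sum per (i, j) tile offset and finally writes it to dst[col_dst*nrows_dst + row_dst]. A minimal host-side C++ sketch of that index mapping, with illustrative values and a made-up helper name (not part of the commit):

```cpp
// Host-side sketch (illustration only): reproduce how the removed mul_mat_q kernel
// above maps a block/thread and tile offsets (i, j) to the dst element it writes,
// i.e. dst[col_dst*nrows_dst + row_dst].
#include <cstdio>

static void map_to_dst(int blockIdx_x, int blockIdx_y, int threadIdx_x, int threadIdx_y,
                       int i, int j, int mmq_x, int mmq_y, int nrows_dst) {
    const int row_dst = blockIdx_x*mmq_y + threadIdx_x + i; // i runs over 0, WARP_SIZE, 2*WARP_SIZE, ...
    const int col_dst = blockIdx_y*mmq_x + threadIdx_y + j; // j runs over 0, nwarps, 2*nwarps, ...
    std::printf("block (%d,%d) thread (%d,%d) tile offset (i=%d, j=%d) -> dst[%d]\n",
                blockIdx_x, blockIdx_y, threadIdx_x, threadIdx_y, i, j,
                col_dst*nrows_dst + row_dst);
}

int main() {
    // Illustrative numbers only.
    map_to_dst(0, 0, 3, 1, 32, 0, /*mmq_x=*/64, /*mmq_y=*/64, /*nrows_dst=*/4096);
    return 0;
}
```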
1179
- static constexpr __device__ mmq_arch_config_t get_arch_config_device(mmq_config_t mmq_config) {
1180
-
1181
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1182
-
1183
- #if defined(RDNA3) || defined(RDNA2)
1184
- return mmq_config.rdna2;
1185
- #else
1186
- return mmq_config.rdna1;
1187
- #endif // defined(RDNA3) || defined(RDNA2)
1188
-
1189
- #else
1190
-
1191
- #if __CUDA_ARCH__ >= CC_VOLTA
1192
- return mmq_config.ampere;
1193
- #else
1194
- return mmq_config.pascal;
1195
- #endif // __CUDA_ARCH__ >= CC_VOLTA
1196
-
1197
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1198
- }
1199
-
1200
- template <bool need_check> static __global__ void
1201
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1202
- #if defined(RDNA3) || defined(RDNA2)
1203
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_0.rdna2.nwarps, 2)
1204
- #endif // defined(RDNA3) || defined(RDNA2)
1205
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1206
- mul_mat_q4_0(
1207
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1208
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1209
-
1210
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
1211
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_0);
1212
-
1213
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_0<arch_config.y>,
1214
- load_tiles_q4_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
1215
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1216
- #else
1217
- GGML_UNUSED(get_arch_config_device);
1218
- GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat);
1219
- NO_DEVICE_CODE;
1220
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1221
- }
1222
-
1223
- template <bool need_check> static __global__ void
1224
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1225
- #if defined(RDNA3) || defined(RDNA2)
1226
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.rdna2.nwarps, 2)
1227
- #endif // defined(RDNA3) || defined(RDNA2)
1228
- #elif __CUDA_ARCH__ < CC_VOLTA
1229
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.pascal.nwarps, 2)
1230
- #endif // __CUDA_ARCH__ < CC_VOLTA
1231
- mul_mat_q4_1(
1232
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1233
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1234
-
1235
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
1236
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_1);
1237
-
1238
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_1<arch_config.y>,
1239
- load_tiles_q4_1<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
1240
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1241
- #else
1242
- GGML_UNUSED(get_arch_config_device);
1243
- GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat);
1244
- NO_DEVICE_CODE;
1245
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1246
- }
1247
-
1248
- template <bool need_check> static __global__ void
1249
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1250
- #if defined(RDNA3) || defined(RDNA2)
1251
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_0.rdna2.nwarps, 2)
1252
- #endif // defined(RDNA3) || defined(RDNA2)
1253
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1254
- mul_mat_q5_0(
1255
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1256
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1257
-
1258
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
1259
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_0);
1260
-
1261
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_0<arch_config.y>,
1262
- load_tiles_q5_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
1263
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1264
- #else
1265
- GGML_UNUSED(get_arch_config_device);
1266
- GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat);
1267
- NO_DEVICE_CODE;
1268
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1269
- }
1270
-
1271
- template <bool need_check> static __global__ void
1272
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1273
- #if defined(RDNA3) || defined(RDNA2)
1274
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_1.rdna2.nwarps, 2)
1275
- #endif // defined(RDNA3) || defined(RDNA2)
1276
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1277
- mul_mat_q5_1(
1278
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1279
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1280
-
1281
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
1282
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_1);
1283
-
1284
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_1<arch_config.y>,
1285
- load_tiles_q5_1<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
1286
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1287
- #else
1288
- GGML_UNUSED(get_arch_config_device);
1289
- GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat);
1290
- NO_DEVICE_CODE;
1291
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1292
- }
1293
-
1294
- template <bool need_check> static __global__ void
1295
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1296
- #if defined(RDNA3) || defined(RDNA2)
1297
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q8_0.rdna2.nwarps, 2)
1298
- #endif // defined(RDNA3) || defined(RDNA2)
1299
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1300
- mul_mat_q8_0(
1301
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1302
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1303
-
1304
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
1305
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q8_0);
1306
-
1307
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q8_0<arch_config.y>,
1308
- load_tiles_q8_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
1309
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1310
- #else
1311
- GGML_UNUSED(get_arch_config_device);
1312
- GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat);
1313
- NO_DEVICE_CODE;
1314
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1315
- }
1316
-
1317
- template <bool need_check> static __global__ void
1318
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1319
- #if defined(RDNA3) || defined(RDNA2)
1320
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q2_K.rdna2.nwarps, 2)
1321
- #endif // defined(RDNA3) || defined(RDNA2)
1322
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1323
- mul_mat_q2_K(
1324
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1325
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1326
-
1327
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
1328
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q2_K);
1329
-
1330
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q2_K<arch_config.y>,
1331
- load_tiles_q2_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
1332
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1333
- #else
1334
- GGML_UNUSED(get_arch_config_device);
1335
- GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat);
1336
- NO_DEVICE_CODE;
1337
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1338
- }
1339
-
1340
- template <bool need_check> static __global__ void
1341
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1342
- #if defined(RDNA3) || defined(RDNA2)
1343
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.rdna2.nwarps, 2)
1344
- #endif // defined(RDNA3) || defined(RDNA2)
1345
- #elif __CUDA_ARCH__ < CC_VOLTA
1346
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.pascal.nwarps, 2)
1347
- #endif // __CUDA_ARCH__ < CC_VOLTA
1348
- mul_mat_q3_K(
1349
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1350
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1351
-
1352
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
1353
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q3_K);
1354
-
1355
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q3_K<arch_config.y>,
1356
- load_tiles_q3_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
1357
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1358
- #else
1359
- GGML_UNUSED(get_arch_config_device);
1360
- GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat);
1361
- NO_DEVICE_CODE;
1362
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1363
- }
1364
-
1365
- template <bool need_check> static __global__ void
1366
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1367
- #if defined(RDNA3) || defined(RDNA2)
1368
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.rdna2.nwarps, 2)
1369
- #endif // defined(RDNA3) || defined(RDNA2)
1370
- #elif __CUDA_ARCH__ < CC_VOLTA
1371
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2)
1372
- #endif // __CUDA_ARCH__ < CC_VOLTA
1373
- mul_mat_q4_K(
1374
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1375
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1376
-
1377
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
1378
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_K);
1379
-
1380
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_K<arch_config.y>,
1381
- load_tiles_q4_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
1382
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1383
- #else
1384
- GGML_UNUSED(get_arch_config_device);
1385
- GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat);
1386
- NO_DEVICE_CODE;
1387
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1388
- }
1389
-
1390
- template <bool need_check> static __global__ void
1391
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1392
- #if defined(RDNA3) || defined(RDNA2)
1393
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_K.rdna2.nwarps, 2)
1394
- #endif // defined(RDNA3) || defined(RDNA2)
1395
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1396
- mul_mat_q5_K(
1397
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1398
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1399
-
1400
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
1401
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_K);
1402
-
1403
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_K<arch_config.y>,
1404
- load_tiles_q5_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
1405
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1406
- #else
1407
- GGML_UNUSED(get_arch_config_device);
1408
- GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat);
1409
- NO_DEVICE_CODE;
1410
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1411
- }
1412
-
1413
- template <bool need_check> static __global__ void
1414
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1415
- #if defined(RDNA3) || defined(RDNA2)
1416
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q6_K.rdna2.nwarps, 2)
1417
- #endif // defined(RDNA3) || defined(RDNA2)
1418
- #elif __CUDA_ARCH__ < CC_VOLTA
1419
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2)
1420
- #endif // __CUDA_ARCH__ < CC_VOLTA
1421
- mul_mat_q6_K(
1422
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1423
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1424
-
1425
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
1426
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q6_K);
1427
-
1428
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q6_K<arch_config.y>,
1429
- load_tiles_q6_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
1430
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
1431
- #else
1432
- GGML_UNUSED(get_arch_config_device);
1433
- GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat);
1434
- NO_DEVICE_CODE;
1435
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1436
- }
1437
-
1438
- #define MMQ_SWITCH_CASE(type_suffix) \
1439
- case GGML_TYPE_Q##type_suffix: if (row_diff % arch_config.y == 0) { \
1440
- const bool need_check = false; \
1441
- mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
1442
- (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
1443
- } else { \
1444
- const bool need_check = true; \
1445
- mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
1446
- (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
1447
- } break; \
1448
 
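MMQ_SWITCH_CASE above launches the need_check=false kernel only when row_diff is a multiple of the tile height (arch_config.y), and the removed host code further down sizes the grid by ceil division. A small host-side sketch of both decisions; the struct and helper name are hypothetical:

```cpp
// Illustrative host-side sketch: decide need_check and the launch grid the same
// way as the removed MMQ host code (ceil division over the tile sizes).
#include <cstdio>

struct launch_geom {
    int  block_num_x; // grid size along the rows of the quantized matrix
    int  block_num_y; // grid size along the columns of the activation matrix
    bool need_check;  // true if the last row tile is only partially filled
};

static launch_geom plan_launch(int row_diff, int src1_ncols, int mmq_x, int mmq_y) {
    launch_geom g;
    g.block_num_x = (row_diff   + mmq_y - 1) / mmq_y;
    g.block_num_y = (src1_ncols + mmq_x - 1) / mmq_x;
    g.need_check  = (row_diff % mmq_y) != 0;
    return g;
}

int main() {
    // Illustrative numbers only.
    const launch_geom g = plan_launch(/*row_diff=*/4097, /*src1_ncols=*/512, /*mmq_x=*/64, /*mmq_y=*/128);
    std::printf("grid = (%d, %d), need_check = %d\n", g.block_num_x, g.block_num_y, g.need_check);
    return 0;
}
```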
1449
  void ggml_cuda_op_mul_mat_q(
1450
  ggml_backend_cuda_context & ctx,
@@ -1454,12 +8,15 @@ void ggml_cuda_op_mul_mat_q(
1454
 
1455
  const int64_t ne00 = src0->ne[0];
1456
 
 
 
1457
  const int64_t ne10 = src1->ne[0];
1458
  GGML_ASSERT(ne10 % QK8_1 == 0);
1459
 
1460
  const int64_t ne0 = dst->ne[0];
1461
 
1462
  const int64_t row_diff = row_high - row_low;
 
1463
 
1464
  int id = ggml_cuda_get_device();
1465
  const int compute_capability = ggml_cuda_info().devices[id].cc;
@@ -1468,73 +25,39 @@ void ggml_cuda_op_mul_mat_q(
1468
  // nrows_dst == nrows of the matrix that the kernel writes into
1469
  const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
1470
 
1471
- mmq_config_t mmq_config;
1472
 
1473
  switch (src0->type) {
1474
  case GGML_TYPE_Q4_0:
1475
- mmq_config = MMQ_CONFIG_Q4_0;
1476
  break;
1477
  case GGML_TYPE_Q4_1:
1478
- mmq_config = MMQ_CONFIG_Q4_1;
1479
  break;
1480
  case GGML_TYPE_Q5_0:
1481
- mmq_config = MMQ_CONFIG_Q5_0;
1482
  break;
1483
  case GGML_TYPE_Q5_1:
1484
- mmq_config = MMQ_CONFIG_Q5_1;
1485
  break;
1486
  case GGML_TYPE_Q8_0:
1487
- mmq_config = MMQ_CONFIG_Q8_0;
1488
  break;
1489
  case GGML_TYPE_Q2_K:
1490
- mmq_config = MMQ_CONFIG_Q2_K;
1491
  break;
1492
  case GGML_TYPE_Q3_K:
1493
- mmq_config = MMQ_CONFIG_Q3_K;
1494
  break;
1495
  case GGML_TYPE_Q4_K:
1496
- mmq_config = MMQ_CONFIG_Q4_K;
1497
  break;
1498
  case GGML_TYPE_Q5_K:
1499
- mmq_config = MMQ_CONFIG_Q5_K;
1500
  break;
1501
  case GGML_TYPE_Q6_K:
1502
- mmq_config = MMQ_CONFIG_Q6_K;
1503
- break;
1504
- default:
1505
- GGML_ASSERT(false);
1506
  break;
1507
- }
1508
-
1509
- mmq_arch_config_t arch_config;
1510
- if (compute_capability >= CC_RDNA2) {
1511
- arch_config = mmq_config.rdna2;
1512
- } else if (compute_capability >= CC_OFFSET_AMD) {
1513
- arch_config = mmq_config.rdna1;
1514
- } else if (compute_capability >= CC_VOLTA) {
1515
- arch_config = mmq_config.ampere;
1516
- } else if (compute_capability >= MIN_CC_DP4A) {
1517
- arch_config = mmq_config.pascal;
1518
- } else {
1519
- GGML_ASSERT(false);
1520
- }
1521
-
1522
- const int block_num_x = (row_diff + arch_config.y - 1) / arch_config.y;
1523
- const int block_num_y = (src1_ncols + arch_config.x - 1) / arch_config.x;
1524
- const dim3 block_nums(block_num_x, block_num_y, 1);
1525
- const dim3 block_dims(WARP_SIZE, arch_config.nwarps, 1);
1526
-
1527
- switch (src0->type) {
1528
- MMQ_SWITCH_CASE(4_0)
1529
- MMQ_SWITCH_CASE(4_1)
1530
- MMQ_SWITCH_CASE(5_0)
1531
- MMQ_SWITCH_CASE(5_1)
1532
- MMQ_SWITCH_CASE(8_0)
1533
- MMQ_SWITCH_CASE(2_K)
1534
- MMQ_SWITCH_CASE(3_K)
1535
- MMQ_SWITCH_CASE(4_K)
1536
- MMQ_SWITCH_CASE(5_K)
1537
- MMQ_SWITCH_CASE(6_K)
1538
  default:
1539
  GGML_ASSERT(false);
1540
  break;
 
1
  #include "mmq.cuh"
 
 
2
 
3
  void ggml_cuda_op_mul_mat_q(
4
  ggml_backend_cuda_context & ctx,
 
8
 
9
  const int64_t ne00 = src0->ne[0];
10
 
11
+ const int64_t nb01 = src0->nb[1];
12
+
13
  const int64_t ne10 = src1->ne[0];
14
  GGML_ASSERT(ne10 % QK8_1 == 0);
15
 
16
  const int64_t ne0 = dst->ne[0];
17
 
18
  const int64_t row_diff = row_high - row_low;
19
+ const int64_t stride00 = nb01 / ggml_type_size(src0->type);
20
 
21
  int id = ggml_cuda_get_device();
22
  const int compute_capability = ggml_cuda_info().devices[id].cc;
 
25
  // nrows_dst == nrows of the matrix that the kernel writes into
26
  const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
27
 
28
+ const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, nrows_dst};
29
 
30
  switch (src0->type) {
31
  case GGML_TYPE_Q4_0:
32
+ mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
33
  break;
34
  case GGML_TYPE_Q4_1:
35
+ mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
36
  break;
37
  case GGML_TYPE_Q5_0:
38
+ mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
39
  break;
40
  case GGML_TYPE_Q5_1:
41
+ mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
42
  break;
43
  case GGML_TYPE_Q8_0:
44
+ mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
45
  break;
46
  case GGML_TYPE_Q2_K:
47
+ mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
48
  break;
49
  case GGML_TYPE_Q3_K:
50
+ mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
51
  break;
52
  case GGML_TYPE_Q4_K:
53
+ mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
54
  break;
55
  case GGML_TYPE_Q5_K:
56
+ mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
57
  break;
58
  case GGML_TYPE_Q6_K:
59
+ mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
60
  break;
 
 
61
  default:
62
  GGML_ASSERT(false);
63
  break;
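The new host path above derives stride00 = nb01 / ggml_type_size(src0->type), i.e. the row stride of src0 counted in quantized blocks, and passes it via mmq_args. A small sketch of that arithmetic for a contiguous Q4_0 matrix, assuming the 18-byte block_q4_0 layout (fp16 scale plus 16 bytes of packed nibbles, QK4_0 = 32) from ggml-common.h; the concrete sizes are only for illustration:

```cpp
// Illustration of stride00 = nb01 / ggml_type_size(src0->type) for a contiguous Q4_0 tensor.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne00            = 4096;         // elements per row (illustrative)
    const int64_t qk4_0           = 32;           // elements per Q4_0 block
    const int64_t block_q4_0_size = 2 + qk4_0/2;  // 18 bytes: fp16 d + 16 bytes of nibbles

    const int64_t nb01     = (ne00/qk4_0) * block_q4_0_size; // byte stride between rows
    const int64_t stride00 = nb01 / block_q4_0_size;         // row stride measured in blocks

    std::printf("nb01 = %lld bytes, stride00 = %lld blocks per row\n",
                (long long) nb01, (long long) stride00);
    return 0;
}
```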
ggml-cuda/mmq.cuh CHANGED
@@ -1,4 +1,1304 @@
1
  #include "common.cuh"
 
 
2
 
3
  void ggml_cuda_op_mul_mat_q(
4
  ggml_backend_cuda_context & ctx,
 
2
+ #include "vecdotq.cuh"
3
+
4
+ #include <climits>
5
+ #include <cstdint>
6
+
7
+ typedef void (*load_tiles_mmq_t)(
8
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
9
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride);
10
+ typedef void (*vec_dot_mmq_t)(
11
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
12
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, float * __restrict__ sum, const int & k0);
13
+
14
+ struct tile_x_sizes {
15
+ int ql;
16
+ int dm;
17
+ int qh;
18
+ int sc;
19
+ };
20
+
21
+ // get_mmq_x_max_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row
22
+
23
+ static constexpr __device__ int get_mmq_x_max_device() {
24
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
25
+ return 64;
26
+ #else
27
+ #if __CUDA_ARCH__ >= CC_VOLTA
28
+ #ifdef CUDA_USE_TENSOR_CORES
29
+ return MMQ_MAX_BATCH_SIZE;
30
+ #else
31
+ return 128;
32
+ #endif // CUDA_USE_TENSOR_CORES
33
+ #else
34
+ return 64;
35
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
36
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
37
+ }
38
+
39
+ // get_mmq_y_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row
40
+
41
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
42
+ static constexpr __device__ int get_mmq_y_device(int mmq_x) {
43
+ return mmq_x >= 32 ? 128 : 64;
44
+ }
45
+ #else
46
+ #if __CUDA_ARCH__ >= CC_VOLTA
47
+ static constexpr __device__ int get_mmq_y_device(int mmq_x) {
48
+ return mmq_x >= 32 ? 128 : 64;
49
+ }
50
+ #else
51
+ static constexpr __device__ int get_mmq_y_device(int /*mmq_x*/) {
52
+ return 64;
53
+ }
54
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
55
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
56
+
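get_mmq_x_max_device() and get_mmq_y_device() above select the tile shape per target architecture. A host-side mirror of one branch (NVIDIA, __CUDA_ARCH__ >= CC_VOLTA, CUDA_USE_TENSOR_CORES not defined), purely as an illustration of the selection logic and not code from the commit:

```cpp
// Host-side mirror of the device-side tile-shape selection above,
// assuming an NVIDIA GPU with __CUDA_ARCH__ >= CC_VOLTA and no CUDA_USE_TENSOR_CORES.
#include <cstdio>

static int mmq_x_max_volta_no_tc() { return 128; }

static int mmq_y_for(int mmq_x) { return mmq_x >= 32 ? 128 : 64; }

int main() {
    for (int mmq_x = 8; mmq_x <= mmq_x_max_volta_no_tc(); mmq_x *= 2) {
        std::printf("mmq_x = %3d -> mmq_y = %d\n", mmq_x, mmq_y_for(mmq_x));
    }
    return 0;
}
```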
57
+ #define TILE_X_SIZES_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0, 0}
58
+ #define TILE_X_SIZES_Q4_1 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_1 + mmq_y/QI4_1, 0, 0}
59
+ #define TILE_X_SIZES_Q5_0 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_0 + mmq_y/QI5_0, 0, 0}
60
+ #define TILE_X_SIZES_Q5_1 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_1 + mmq_y/QI5_1, 0, 0}
61
+ #define TILE_X_SIZES_Q8_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI8_0 + mmq_y/QI8_0, 0, 0}
62
+ #define TILE_X_SIZES_Q2_K tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI2_K + mmq_y/QI2_K, 0, mmq_y*WARP_SIZE/4 + mmq_y/4}
63
+ #define TILE_X_SIZES_Q3_K tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI3_K + mmq_y/QI3_K, mmq_y*WARP_SIZE/2 + mmq_y/2, mmq_y*WARP_SIZE/4 + mmq_y/4}
64
+ #define TILE_X_SIZES_Q4_K tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_K + mmq_y/QI4_K, 0, mmq_y*WARP_SIZE/8 + mmq_y/8}
65
+ #define TILE_X_SIZES_Q5_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_K + mmq_y/QI5_K, 0, mmq_y*WARP_SIZE/8 + mmq_y/8}
66
+ #define TILE_X_SIZES_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + mmq_y/QI6_K, 0, mmq_y*WARP_SIZE/8 + mmq_y/8}
67
+
68
+ #define GET_TILE_X_SIZES_BODY \
69
+ return type == GGML_TYPE_Q4_0 ? TILE_X_SIZES_Q4_0 : \
70
+ type == GGML_TYPE_Q4_1 ? TILE_X_SIZES_Q4_1 : \
71
+ type == GGML_TYPE_Q5_0 ? TILE_X_SIZES_Q5_0 : \
72
+ type == GGML_TYPE_Q5_1 ? TILE_X_SIZES_Q5_1 : \
73
+ type == GGML_TYPE_Q8_0 ? TILE_X_SIZES_Q8_0 : \
74
+ type == GGML_TYPE_Q2_K ? TILE_X_SIZES_Q2_K : \
75
+ type == GGML_TYPE_Q3_K ? TILE_X_SIZES_Q3_K : \
76
+ type == GGML_TYPE_Q4_K ? TILE_X_SIZES_Q4_K : \
77
+ type == GGML_TYPE_Q5_K ? TILE_X_SIZES_Q5_K : \
78
+ type == GGML_TYPE_Q6_K ? TILE_X_SIZES_Q6_K : \
79
+ tile_x_sizes{0, 0, 0, 0}
80
+
81
+ static tile_x_sizes get_tile_x_sizes_host(const ggml_type type, const int mmq_y) {
82
+ GET_TILE_X_SIZES_BODY;
83
+ }
84
+
85
+ template <int mmq_y>
86
+ static constexpr __device__ tile_x_sizes get_tile_x_sizes_device(ggml_type type) {
87
+ GET_TILE_X_SIZES_BODY;
88
+ }
89
+
90
+ // ------------------------------------------------------------
91
+
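The TILE_X_SIZES_* macros above give per-tile element counts for the shared-memory x buffers (x_ql/x_qh/x_sc hold int, x_dm holds half2, all 4 bytes per element). A short sketch totalling the bytes for Q4_0 with mmq_y = 64, assuming WARP_SIZE = 32 and QI4_0 = 4 as defined in the common headers; the numbers are only for illustration:

```cpp
// Sketch: shared-memory footprint of the Q4_0 x tile for mmq_y = 64,
// following TILE_X_SIZES_Q4_0 above (int and half2 are both 4 bytes).
#include <cstdio>

int main() {
    const int WARP_SIZE = 32;
    const int QI4_0     = 4;  // ints of quants per Q4_0 block (from the common headers)
    const int mmq_y     = 64;

    const int ql = mmq_y*WARP_SIZE       + mmq_y;       // 2112 ints (includes the +mmq_y padding)
    const int dm = mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0; //  528 half2
    const int qh = 0;
    const int sc = 0;

    const int bytes = 4*(ql + qh + sc) + 4*dm;
    std::printf("Q4_0 x tile, mmq_y = 64: %d bytes of shared memory\n", bytes); // 10560
    return 0;
}
```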
92
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
93
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
94
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
95
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
96
+
97
+ const int kbx = threadIdx.x / QI4_0;
98
+ const int kqsx = threadIdx.x % QI4_0;
99
+
100
+ float * x_dmf = (float *) x_dm;
101
+
102
+ #pragma unroll
103
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
104
+ int i = i0 + threadIdx.y;
105
+
106
+ if (need_check) {
107
+ i = min(i, i_max);
108
+ }
109
+
110
+ const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbx;
111
+
112
+ x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8(bxi->qs, kqsx);
113
+ }
114
+
115
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
116
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
117
+
118
+ #pragma unroll
119
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
120
+ int i = i0 + threadIdx.y * QI4_0 + threadIdx.x / blocks_per_tile_x_row;
121
+
122
+ if (need_check) {
123
+ i = min(i, i_max);
124
+ }
125
+
126
+ const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbxd;
127
+
128
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
129
+ }
130
+ }
131
+
132
+ template <int mmq_x, int mmq_y, int nwarps>
133
+ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat(
134
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
135
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
136
+
137
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
138
+
139
+ #pragma unroll
140
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
141
+ const int j = j0 + threadIdx.y;
142
+
143
+ #pragma unroll
144
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
145
+ const int i = i0 + threadIdx.x;
146
+
147
+ const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
148
+ const float * x_dmf = (const float *) x_dm;
149
+
150
+ int u[2*VDR_Q4_0_Q8_1_MMQ];
151
+
152
+ #pragma unroll
153
+ for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
154
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
155
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
156
+ }
157
+
158
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
159
+ (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0],
160
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
161
+ }
162
+ }
163
+ }
164
+
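vec_dot_q4_0_q8_1_mul_mat above addresses the y tile with kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)), pairing each group of low nibbles with y_qs[kyqs + l] and the high nibbles with y_qs[kyqs + l + QI4_0] (modulo WARP_SIZE in the kernel). A tiny sketch printing this mapping for the first few k0 values, assuming QI8_1 = 8 and QI4_0 = 4 from the common headers:

```cpp
// Sketch of the y-index arithmetic in vec_dot_q4_0_q8_1_mul_mat above.
#include <cstdio>

int main() {
    const int QI8_1 = 8; // ints of quants per q8_1 block (from the common headers)
    const int QI4_0 = 4;

    for (int k0 = 0; k0 < 8; ++k0) {
        const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
        std::printf("k0 = %d -> kyqs = %2d (low nibbles), kyqs + QI4_0 = %2d (high nibbles)\n",
                    k0, kyqs, kyqs + QI4_0);
    }
    return 0;
}
```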
165
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
166
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
167
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
168
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
169
+
170
+ const int kbx = threadIdx.x / QI4_1;
171
+ const int kqsx = threadIdx.x % QI4_1;
172
+
173
+ #pragma unroll
174
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
175
+ int i = i0 + threadIdx.y;
176
+
177
+ if (need_check) {
178
+ i = min(i, i_max);
179
+ }
180
+
181
+ const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbx;
182
+
183
+ x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8_aligned(bxi->qs, kqsx);
184
+ }
185
+
186
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
187
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
188
+
189
+ #pragma unroll
190
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
191
+ int i = i0 + threadIdx.y * QI4_1 + threadIdx.x / blocks_per_tile_x_row;
192
+
193
+ if (need_check) {
194
+ i = min(i, i_max);
195
+ }
196
+
197
+ const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbxd;
198
+
199
+ x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
200
+ }
201
+ }
202
+
203
+ template <int mmq_x, int mmq_y, int nwarps>
204
+ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat(
205
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
206
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
207
+
208
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
209
+
210
+ #pragma unroll
211
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
212
+ const int j = j0 + threadIdx.y;
213
+
214
+ #pragma unroll
215
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
216
+ const int i = i0 + threadIdx.x;
217
+
218
+ const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
219
+
220
+ int u[2*VDR_Q4_1_Q8_1_MMQ];
221
+
222
+ #pragma unroll
223
+ for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
224
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
225
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
226
+ }
227
+
228
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
229
+ (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1],
230
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
231
+ }
232
+ }
233
+ }
234
+
235
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
236
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
237
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
238
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
239
+
240
+ const int kbx = threadIdx.x / QI5_0;
241
+ const int kqsx = threadIdx.x % QI5_0;
242
+
243
+ #pragma unroll
244
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
245
+ int i = i0 + threadIdx.y;
246
+
247
+ if (need_check) {
248
+ i = min(i, i_max);
249
+ }
250
+
251
+ const block_q5_0 * bxi = (const block_q5_0 *) x + kbx0 + i*stride + kbx;
252
+
253
+ const int ql = get_int_from_uint8(bxi->qs, kqsx);
254
+ const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (threadIdx.x % QI5_0));
255
+
256
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
257
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
258
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
259
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
260
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
261
+ qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
262
+
263
+ x_ql[i * (2*WARP_SIZE + 1) + 2*threadIdx.x+0] = qs0;
264
+
265
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
266
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
267
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
268
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
269
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
270
+ qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
271
+
272
+ x_ql[i * (2*WARP_SIZE + 1) + 2*threadIdx.x+1] = qs1;
273
+ }
274
+
275
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
276
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
277
+ float * x_dmf = (float *) x_dm;
278
+
279
+ #pragma unroll
280
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
281
+ int i = i0 + threadIdx.y * QI5_0 + threadIdx.x / blocks_per_tile_x_row;
282
+
283
+ if (need_check) {
284
+ i = min(i, i_max);
285
+ }
286
+
287
+ const block_q5_0 * bxi = (const block_q5_0 *) x + kbx0 + i*stride + kbxd;
288
+
289
+ x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
290
+ }
291
+ }
292
+
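load_tiles_q5_0 above (and load_tiles_q5_1 below) splice the four high bits of each nibble group into bit positions 4, 12, 20 and 28 of the packed word, as the "0 -> 4", "1 -> 12", ... comments indicate. A standalone sketch of that bit movement so the masks can be checked outside the kernel; the helper name and test values are illustrative:

```cpp
// Sketch of the qh bit splicing used by load_tiles_q5_0/load_tiles_q5_1 above:
// bits 0..3 of qh end up at bit positions 4, 12, 20 and 28 of the packed quants.
#include <cstdio>

static unsigned splice_qh_low(unsigned qh) {
    unsigned out = 0;
    out |= (qh << 4)  & 0x00000010u; // bit 0 -> bit 4
    out |= (qh << 11) & 0x00001000u; // bit 1 -> bit 12
    out |= (qh << 18) & 0x00100000u; // bit 2 -> bit 20
    out |= (qh << 25) & 0x10000000u; // bit 3 -> bit 28
    return out;
}

int main() {
    std::printf("qh = 0xF -> 0x%08X (expected 0x10101010)\n", splice_qh_low(0xFu));
    std::printf("qh = 0x5 -> 0x%08X (bits 0 and 2 spliced in)\n", splice_qh_low(0x5u));
    return 0;
}
```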
293
+ template <int mmq_x, int mmq_y, int nwarps>
294
+ static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat(
295
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
296
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
297
+
298
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
299
+
300
+ #pragma unroll
301
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
302
+ const int j = j0 + threadIdx.y;
303
+
304
+ #pragma unroll
305
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
306
+ const int i = i0 + threadIdx.x;
307
+
308
+ const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
309
+ const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0;
310
+ const float * x_dmf = (const float *) x_dm;
311
+ const float * y_df = (const float *) y_ds;
312
+
313
+ int u[2*VDR_Q5_0_Q8_1_MMQ];
314
+
315
+ #pragma unroll
316
+ for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
317
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
318
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
319
+ }
320
+
321
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, QR5_0*VDR_Q5_0_Q8_1_MMQ>
322
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
323
+ }
324
+ }
325
+ }
326
+
327
+
328
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
329
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
330
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
331
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
332
+
333
+ const int kbx = threadIdx.x / QI5_1;
334
+ const int kqsx = threadIdx.x % QI5_1;
335
+
336
+ #pragma unroll
337
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
338
+ int i = i0 + threadIdx.y;
339
+
340
+ if (need_check) {
341
+ i = min(i, i_max);
342
+ }
343
+
344
+ const block_q5_1 * bxi = (const block_q5_1 *) x + kbx0 + i*stride + kbx;
345
+
346
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
347
+ const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (threadIdx.x % QI5_1));
348
+
349
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
350
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
351
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
352
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
353
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
354
+
355
+ x_ql[i * (2*WARP_SIZE + 1) + 2*threadIdx.x+0] = qs0;
356
+
357
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
358
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
359
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
360
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
361
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
362
+
363
+ x_ql[i * (2*WARP_SIZE + 1) + 2*threadIdx.x+1] = qs1;
364
+ }
365
+
366
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
367
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
368
+
369
+ #pragma unroll
370
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
371
+ int i = i0 + threadIdx.y * QI5_1 + threadIdx.x / blocks_per_tile_x_row;
372
+
373
+ if (need_check) {
374
+ i = min(i, i_max);
375
+ }
376
+
377
+ const block_q5_1 * bxi = (const block_q5_1 *) x + kbx0 + i*stride + kbxd;
378
+
379
+ x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
380
+ }
381
+ }
382
+
383
+ template <int mmq_x, int mmq_y, int nwarps>
384
+ static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat(
385
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
386
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
387
+
388
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
389
+
390
+ #pragma unroll
391
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
392
+ const int j = j0 + threadIdx.y;
393
+
394
+ #pragma unroll
395
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
396
+ const int i = i0 + threadIdx.x;
397
+
398
+ const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
399
+ const int index_bx = i * (WARP_SIZE/QI5_1) + i/QI5_1 + k0/QI5_1;
400
+
401
+ int u[2*VDR_Q5_1_Q8_1_MMQ];
402
+
403
+ #pragma unroll
404
+ for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
405
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
406
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
407
+ }
408
+
409
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
410
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
411
+ }
412
+ }
413
+ }
414
+
415
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
416
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
417
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
418
+
419
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
420
+
421
+ const int kbx = threadIdx.x / QI8_0;
422
+ const int kqsx = threadIdx.x % QI8_0;
423
+ float * x_dmf = (float *) x_dm;
424
+
425
+ #pragma unroll
426
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
427
+ int i = i0 + threadIdx.y;
428
+
429
+ if (need_check) {
430
+ i = min(i, i_max);
431
+ }
432
+
433
+ const block_q8_0 * bxi = (const block_q8_0 *) x + kbx0 + i*stride + kbx;
434
+
435
+ x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_int8(bxi->qs, kqsx);
436
+ }
437
+
438
+ const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
439
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
440
+
441
+ #pragma unroll
442
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
443
+ int i = i0 + threadIdx.y * QI8_0 + threadIdx.x / blocks_per_tile_x_row;
444
+
445
+ if (need_check) {
446
+ i = min(i, i_max);
447
+ }
448
+
449
+ const block_q8_0 * bxi = (const block_q8_0 *) x + kbx0 + i*stride + kbxd;
450
+
451
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
452
+ }
453
+ }
454
+
455
+ template <int mmq_x, int mmq_y, int nwarps>
456
+ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat(
457
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
458
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
459
+
460
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
461
+
462
+ #pragma unroll
463
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
464
+ const int j = j0 + threadIdx.y;
465
+
466
+ #pragma unroll
467
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
468
+ const int i = i0 + threadIdx.x;
469
+
470
+ const float * x_dmf = (const float *) x_dm;
471
+ const float * y_df = (const float *) y_ds;
472
+
473
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
474
+ (&x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[j * WARP_SIZE + k0], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0],
475
+ y_df[j * (WARP_SIZE/QI8_1) + k0/QI8_1]);
476
+ }
477
+ }
478
+ }
479
+
480
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
481
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
482
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
483
+ GGML_UNUSED(x_qh);
484
+
485
+ const int kbx = threadIdx.x / QI2_K;
486
+ const int kqsx = threadIdx.x % QI2_K;
487
+
488
+ #pragma unroll
489
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
490
+ int i = i0 + threadIdx.y;
491
+
492
+ if (need_check) {
493
+ i = min(i, i_max);
494
+ }
495
+
496
+ const block_q2_K * bxi = (const block_q2_K *) x + kbx0 + i*stride + kbx;
497
+
498
+ x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8_aligned(bxi->qs, kqsx);
499
+ }
500
+
501
+ const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
502
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
503
+
504
+ #pragma unroll
505
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
506
+ int i = (i0 + threadIdx.y * QI2_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y;
507
+
508
+ if (need_check) {
509
+ i = min(i, i_max);
510
+ }
511
+
512
+ const block_q2_K * bxi = (const block_q2_K *) x + kbx0 + i*stride + kbxd;
513
+
514
+ x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
515
+ }
516
+
517
+ #pragma unroll
518
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
519
+ int i = i0 + threadIdx.y * 4 + threadIdx.x / (WARP_SIZE/4);
520
+
521
+ if (need_check) {
522
+ i = min(i, i_max);
523
+ }
524
+
525
+ const block_q2_K * bxi = (const block_q2_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/4)) / (QI2_K/4);
526
+
527
+ x_sc[i * (WARP_SIZE/4) + i / 4 + threadIdx.x % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, threadIdx.x % (QI2_K/4));
528
+ }
529
+ }
530
+
531
+ template <int mmq_x, int mmq_y, int nwarps>
532
+ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
533
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
534
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
535
+
536
+ GGML_UNUSED(x_qh);
537
+
538
+ #pragma unroll
539
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
540
+ const int j = j0 + threadIdx.y;
541
+
542
+ #pragma unroll
543
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
544
+ const int i = i0 + threadIdx.x;
545
+
546
+ const int kbx = k0 / QI2_K;
547
+ const int ky = (k0 % QI2_K) * QR2_K;
548
+ const float * y_df = (const float *) y_ds;
549
+
550
+ int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
551
+
552
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
553
+ const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
554
+
555
+ #pragma unroll
556
+ for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
557
+ v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
558
+ }
559
+
560
+ const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
561
+
562
+ const int index_y = j * WARP_SIZE + (QR2_K*k0) % WARP_SIZE;
563
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq(
564
+ v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
565
+ }
566
+ }
567
+ }
568
+
569
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
570
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
571
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
572
+
573
+ const int kbx = threadIdx.x / QI3_K;
574
+ const int kqsx = threadIdx.x % QI3_K;
575
+
576
+ #pragma unroll
577
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
578
+ int i = i0 + threadIdx.y;
579
+
580
+ if (need_check) {
581
+ i = min(i, i_max);
582
+ }
583
+
584
+ const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride + kbx;
585
+
586
+ x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8(bxi->qs, kqsx);
587
+ }
588
+
589
+ const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
590
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
591
+ float * x_dmf = (float *) x_dm;
592
+
593
+ #pragma unroll
594
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
595
+ int i = (i0 + threadIdx.y * QI3_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y;
596
+
597
+ if (need_check) {
598
+ i = min(i, i_max);
599
+ }
600
+
601
+ const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride + kbxd;
602
+
603
+ x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
604
+ }
605
+
606
+ #pragma unroll
607
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
608
+ int i = i0 + threadIdx.y * 2 + threadIdx.x / (WARP_SIZE/2);
609
+
610
+ if (need_check) {
611
+ i = min(i, i_max);
612
+ }
613
+
614
+ const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/2)) / (QI3_K/2);
615
+
616
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
617
+ x_qh[i * (WARP_SIZE/2) + i / 2 + threadIdx.x % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, threadIdx.x % (QI3_K/2));
618
+ }
619
+
620
+ #pragma unroll
621
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
622
+ int i = i0 + threadIdx.y * 4 + threadIdx.x / (WARP_SIZE/4);
623
+
624
+ if (need_check) {
625
+ i = min(i, i_max);
626
+ }
627
+
628
+ const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/4)) / (QI3_K/4);
629
+
630
+ const int ksc = threadIdx.x % (QI3_K/4);
631
+
632
+ const int ksc_low = ksc % (QI3_K/8);
633
+ const int shift_low = 4 * (ksc / (QI3_K/8));
634
+ const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
635
+
636
+ const int ksc_high = QI3_K/8;
637
+ const int shift_high = 2 * ksc;
638
+ const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
639
+
640
+ const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
641
+
642
+ x_sc[i * (WARP_SIZE/4) + i / 4 + threadIdx.x % (WARP_SIZE/4)] = sc;
643
+ }
644
+ }
645
+
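
The inverted hmask stored above pairs with vec_dot_q3_K_q8_1_mul_mat further down: per 2-bit quant, a 1 in the inverted mask becomes a 4 that gets subtracted. A minimal, standalone host-side sketch of that arithmetic on a single value (not part of the patch; names are illustrative, not ggml's):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int low2      = 2;              // low 2 bits of one q3_K quant
        const int hmask_bit = 1;              // stored high-mask bit for the same quant

        const int inv = (~hmask_bit) & 1;     // what load_tiles_q3_K stores
        const int vlh = inv << 2;             // 4 if the original bit was 0, else 0
        const int v   = low2 - vlh;           // per-byte equivalent of __vsubss4(vll, vlh)

        printf("unscaled q3_K value: %d\n", v); // 2 - 0 = 2 here
        return 0;
    }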
646
+ template <int mmq_x, int mmq_y, int nwarps>
647
+ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
648
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
649
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
650
+
651
+ #pragma unroll
652
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
653
+ const int j = j0 + threadIdx.y;
654
+
655
+ #pragma unroll
656
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
657
+ const int i = i0 + threadIdx.x;
658
+
659
+ const int kbx = k0 / QI3_K;
660
+ const int ky = (k0 % QI3_K) * QR3_K;
661
+ const float * x_dmf = (const float *) x_dm;
662
+ const float * y_df = (const float *) y_ds;
663
+
664
+ const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
665
+
666
+ int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
667
+
668
+ #pragma unroll
669
+ for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
670
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
671
+ const int shift = 2 * ((ky % 32) / 8);
672
+ const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
673
+
674
+ const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
675
+ const int vlh = (vh << 2) & 0x04040404;
676
+
677
+ v[l] = __vsubss4(vll, vlh);
678
+ }
679
+
680
+ const int index_y = j * WARP_SIZE + (k0*QR3_K) % WARP_SIZE;
681
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q3_K_q8_1_impl_mmq(
682
+ v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
683
+ }
684
+ }
685
+ }
686
+
687
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
688
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
689
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
690
+ GGML_UNUSED(x_qh);
691
+
692
+ const int kbx = 0; // threadIdx.x / QI4_K
693
+ const int kqsx = threadIdx.x; // threadIdx.x % QI4_K
694
+
695
+ #pragma unroll
696
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
697
+ int i = i0 + threadIdx.y;
698
+
699
+ if (need_check) {
700
+ i = min(i, i_max);
701
+ }
702
+
703
+ const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + kbx;
704
+
705
+ x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8_aligned(bxi->qs, kqsx);
706
+ }
707
+
708
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
709
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row; // == 0 if QK_K == 256
710
+
711
+ #pragma unroll
712
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
713
+ int i = (i0 + threadIdx.y * QI4_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y;
714
+
715
+ if (need_check) {
716
+ i = min(i, i_max);
717
+ }
718
+
719
+ const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + kbxd;
720
+
721
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
722
+ }
723
+
724
+ #pragma unroll
725
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
726
+ int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y;
727
+
728
+ if (need_check) {
729
+ i = min(i, i_max);
730
+ }
731
+
732
+ const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/8)) / (QI4_K/8);
733
+
734
+ const int * scales = (const int *) bxi->scales;
735
+
736
+ const int ksc = threadIdx.x % (WARP_SIZE/8);
737
+
738
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
739
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
740
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
741
+
742
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
743
+ }
744
+ }
745
+
746
+ template <int mmq_x, int mmq_y, int nwarps>
747
+ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
748
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
749
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
750
+
751
+ GGML_UNUSED(x_qh);
752
+
753
+ #pragma unroll
754
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
755
+ const int j = j0 + threadIdx.y;
756
+
757
+ #pragma unroll
758
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
759
+ const int i = i0 + threadIdx.x;
760
+
761
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2*((k0 % 16) / 8);
762
+
763
+ const int index_y = j * WARP_SIZE + (QR4_K*k0) % WARP_SIZE;
764
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq(
765
+ &x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
766
+ }
767
+ }
768
+ }
769
+
770
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
771
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
772
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
773
+ GGML_UNUSED(x_qh);
774
+
775
+ const int kbx = 0; // threadIdx.x / QI5_K
776
+ const int kqsx = threadIdx.x; // threadIdx.x % QI5_K
777
+
778
+ #pragma unroll
779
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
780
+ int i = i0 + threadIdx.y;
781
+
782
+ if (need_check) {
783
+ i = min(i, i_max);
784
+ }
785
+
786
+ const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride + kbx;
787
+ const int ky = QR5_K*kqsx;
788
+
789
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
790
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
791
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
792
+
793
+ const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
794
+ const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
795
+ const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
796
+
797
+ const int kq0 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + 0;
798
+ const int kq1 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + (QI5_K/4);
799
+
800
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
801
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
802
+ }
803
+
804
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
805
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row; // == 0 if QK_K == 256
806
+
807
+ #pragma unroll
808
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
809
+ int i = (i0 + threadIdx.y * QI5_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y;
810
+
811
+ if (need_check) {
812
+ i = min(i, i_max);
813
+ }
814
+
815
+ const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride + kbxd;
816
+
817
+ x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
818
+ }
819
+
820
+ #pragma unroll
821
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
822
+ int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y;
823
+
824
+ if (need_check) {
825
+ i = min(i, i_max);
826
+ }
827
+
828
+ const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/8)) / (QI5_K/8);
829
+
830
+ const int * scales = (const int *) bxi->scales;
831
+
832
+ const int ksc = threadIdx.x % (WARP_SIZE/8);
833
+
834
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
835
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
836
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
837
+
838
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
839
+ }
840
+ }
841
+
842
+ template <int mmq_x, int mmq_y, int nwarps>
843
+ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
844
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
845
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
846
+
847
+ GGML_UNUSED(x_qh);
848
+
849
+ #pragma unroll
850
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
851
+ const int j = j0 + threadIdx.y;
852
+
853
+ #pragma unroll
854
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
855
+ const int i = i0 + threadIdx.x;
856
+
857
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8);
858
+
859
+ const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k0;
860
+ const int index_y = j * WARP_SIZE + (QR5_K*k0) % WARP_SIZE;
861
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq(
862
+ &x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
863
+ }
864
+ }
865
+ }
866
+
867
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
868
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
869
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
870
+ GGML_UNUSED(x_qh);
871
+
872
+ const int kbx = 0; // threadIdx.x / QI6_K
873
+ const int kqsx = threadIdx.x; // threadIdx.x % QI6_K
874
+
875
+ #pragma unroll
876
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
877
+ int i = i0 + threadIdx.y;
878
+
879
+ if (need_check) {
880
+ i = min(i, i_max);
881
+ }
882
+
883
+ const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + kbx;
884
+ const int ky = QR6_K*kqsx;
885
+
886
+ const int ql = get_int_from_uint8(bxi->ql, kqsx);
887
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
888
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
889
+
890
+ const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
891
+ const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
892
+ const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
893
+
894
+ const int kq0 = ky - ky % QI6_K + threadIdx.x % (QI6_K/2) + 0;
895
+ const int kq1 = ky - ky % QI6_K + threadIdx.x % (QI6_K/2) + (QI6_K/2);
896
+
897
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
898
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
899
+ }
900
+
901
+ const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
902
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row; // == 0 if QK_K == 256
903
+ float * x_dmf = (float *) x_dm;
904
+
905
+ #pragma unroll
906
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
907
+ int i = (i0 + threadIdx.y * QI6_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y;
908
+
909
+ if (need_check) {
910
+ i = min(i, i_max);
911
+ }
912
+
913
+ const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + kbxd;
914
+
915
+ x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
916
+ }
917
+
918
+ #pragma unroll
919
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
920
+ int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y;
921
+
922
+ if (need_check) {
923
+ i = min(i, i_max);
924
+ }
925
+
926
+ const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/8)) / 4;
927
+
928
+ x_sc[i * (WARP_SIZE/8) + i / 8 + threadIdx.x % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, threadIdx.x % (QI6_K/8));
929
+ }
930
+ }
931
+
932
+ template <int mmq_x, int mmq_y, int nwarps>
933
+ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
934
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
935
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
936
+
937
+ GGML_UNUSED(x_qh);
938
+
939
+ #pragma unroll
940
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
941
+ const int j = j0 + threadIdx.y;
942
+
943
+ #pragma unroll
944
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
945
+ const int i = i0 + threadIdx.x;
946
+
947
+ const float * x_dmf = (const float *) x_dm;
948
+ const float * y_df = (const float *) y_ds;
949
+
950
+ const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/8]);
951
+
952
+ const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k0;
953
+ const int index_y = j * WARP_SIZE + (QR6_K*k0) % WARP_SIZE;
954
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q6_K_q8_1_impl_mmq(
955
+ &x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
956
+ }
957
+ }
958
+ }
959
+
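
All of the x_ql tiles above use a row stride of WARP_SIZE + 1 (or 2*WARP_SIZE + 1 for the 5- and 6-bit types), presumably so that consecutive rows start in different shared-memory banks and strided accesses do not serialize. A standalone sketch of that padding trick, assuming a warp size of 32 (not part of the patch; kernel and variable names are illustrative only):

    #include <cstdio>
    #include <cuda_runtime.h>

    constexpr int kWarpSize = 32;

    __global__ void pad_demo(int * out) {
        // padded row stride, analogous to x_ql[i * (WARP_SIZE + 1) + ...]
        __shared__ int tile[kWarpSize * (kWarpSize + 1)];

        const int tx = threadIdx.x;
        const int ty = threadIdx.y;

        tile[ty * (kWarpSize + 1) + tx] = ty * kWarpSize + tx;
        __syncthreads();

        // transposed read: the 32 threads of a warp read one column; with stride 33
        // the 32 values fall into 32 distinct banks, so the read is conflict-free
        out[ty * kWarpSize + tx] = tile[tx * (kWarpSize + 1) + ty];
    }

    int main() {
        int * d_out = nullptr;
        cudaMalloc(&d_out, kWarpSize * kWarpSize * sizeof(int));
        pad_demo<<<1, dim3(kWarpSize, kWarpSize)>>>(d_out);
        cudaDeviceSynchronize();
        cudaFree(d_out);
        return 0;
    }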
960
+ // -------------------------------------------------------------------------------------------------------------------------------------
961
+
962
+ template <int mmq_x, int mmq_y, int nwarps, bool need_check, ggml_type type>
963
+ struct mmq_type_traits;
964
+
965
+ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
966
+ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
967
+ static constexpr bool need_sum = true;
968
+ static constexpr int vdr = VDR_Q4_0_Q8_1_MMQ;
969
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0<mmq_y, nwarps, need_check>;
970
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
971
+ };
972
+
973
+ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
974
+ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
975
+ static constexpr bool need_sum = true;
976
+ static constexpr int vdr = VDR_Q4_1_Q8_1_MMQ;
977
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_1<mmq_y, nwarps, need_check>;
978
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
979
+ };
980
+
981
+ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
982
+ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
983
+ static constexpr bool need_sum = false;
984
+ static constexpr int vdr = VDR_Q5_0_Q8_1_MMQ;
985
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_0<mmq_y, nwarps, need_check>;
986
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
987
+ };
988
+
989
+ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
990
+ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> {
991
+ static constexpr bool need_sum = true;
992
+ static constexpr int vdr = VDR_Q5_1_Q8_1_MMQ;
993
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_1<mmq_y, nwarps, need_check>;
994
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
995
+ };
996
+
997
+ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
998
+ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
999
+ static constexpr bool need_sum = false;
1000
+ static constexpr int vdr = VDR_Q8_0_Q8_1_MMQ;
1001
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0<mmq_y, nwarps, need_check>;
1002
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q8_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
1003
+ };
1004
+
1005
+ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
1006
+ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> {
1007
+ static constexpr bool need_sum = false;
1008
+ static constexpr int vdr = VDR_Q2_K_Q8_1_MMQ;
1009
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q2_K<mmq_y, nwarps, need_check>;
1010
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q2_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
1011
+ };
1012
+
1013
+ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
1014
+ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
1015
+ static constexpr bool need_sum = false;
1016
+ static constexpr int vdr = VDR_Q3_K_Q8_1_MMQ;
1017
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q3_K<mmq_y, nwarps, need_check>;
1018
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q3_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
1019
+ };
1020
+
1021
+ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
1022
+ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
1023
+ static constexpr bool need_sum = true;
1024
+ static constexpr int vdr = VDR_Q4_K_Q8_1_MMQ;
1025
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K<mmq_y, nwarps, need_check>;
1026
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
1027
+ };
1028
+
1029
+ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
1030
+ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
1031
+ static constexpr bool need_sum = true;
1032
+ static constexpr int vdr = VDR_Q5_K_Q8_1_MMQ;
1033
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K<mmq_y, nwarps, need_check>;
1034
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
1035
+ };
1036
+
1037
+ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
1038
+ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q6_K> {
1039
+ static constexpr bool need_sum = false;
1040
+ static constexpr int vdr = VDR_Q6_K_Q8_1_MMQ;
1041
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K<mmq_y, nwarps, need_check>;
1042
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
1043
+ };
1044
+
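
mmq_type_traits is the compile-time dispatch table of this refactor: one primary template plus one specialization per quantization type, so mul_mat_q below can pull its tile loader, dot product, vdr and need_sum as constexpr members with no runtime cost. A self-contained CPU-side illustration of the same pattern (not part of the patch; enum, struct and values are made up, not ggml's):

    #include <cstdio>

    enum demo_type { DEMO_Q4, DEMO_Q8 };

    template <demo_type T> struct demo_traits;

    template <> struct demo_traits<DEMO_Q4> {
        static constexpr int  vdr      = 2;
        static constexpr bool need_sum = true;
    };

    template <> struct demo_traits<DEMO_Q8> {
        static constexpr int  vdr      = 4;
        static constexpr bool need_sum = false;
    };

    template <demo_type T>
    void report() {
        // resolved entirely at compile time, like mmq_type_traits in mul_mat_q
        printf("vdr=%d need_sum=%d\n", demo_traits<T>::vdr, demo_traits<T>::need_sum);
    }

    int main() {
        report<DEMO_Q4>();
        report<DEMO_Q8>();
        return 0;
    }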
1045
+ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
1046
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1047
+ #if defined(RDNA3) || defined(RDNA2)
1048
+ __launch_bounds__(WARP_SIZE*nwarps, 2)
1049
+ #endif // defined(RDNA3) || defined(RDNA2)
1050
+ #else
1051
+ #if __CUDA_ARCH__ >= CC_VOLTA
1052
+ __launch_bounds__(WARP_SIZE*nwarps, 1)
1053
+ #else
1054
+ __launch_bounds__(WARP_SIZE*nwarps, type == GGML_TYPE_Q2_K ? 1 : 2)
1055
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
1056
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
1057
+ static __global__ void mul_mat_q(
1058
+ const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst,
1059
+ const int ne00, const int ne01, const int stride00, const int ne10, const int ne11, const int ne0) {
1060
+
1061
+ // Skip unused template specializations for faster compilation:
1062
+ if (mmq_x > get_mmq_x_max_device()) {
1063
+ NO_DEVICE_CODE;
1064
+ return;
1065
+ }
1066
+
1067
+ constexpr int qk = ggml_cuda_type_traits<type>::qk;
1068
+ constexpr int qr = ggml_cuda_type_traits<type>::qr;
1069
+ constexpr int qi = ggml_cuda_type_traits<type>::qi;
1070
+ constexpr int mmq_y = get_mmq_y_device(mmq_x);
1071
+ constexpr bool need_sum = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::need_sum;
1072
+ constexpr int vdr = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vdr;
1073
+ constexpr load_tiles_mmq_t load_tiles = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::load_tiles;
1074
+ constexpr vec_dot_mmq_t vec_dot = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vec_dot;
1075
+
1076
+ constexpr tile_x_sizes txs = get_tile_x_sizes_device<mmq_y>(type);
1077
+
1078
+ extern __shared__ char data_mul_mat_q[];
1079
+ int * tile_x_ql = (int *) data_mul_mat_q;
1080
+ half2 * tile_x_dm = (half2 *) (tile_x_ql + txs.ql);
1081
+ int * tile_x_qh = (int *) (tile_x_dm + txs.dm);
1082
+ int * tile_x_sc = (int *) (tile_x_qh + txs.qh);
1083
+ int * tile_y_qs = (int *) (tile_x_sc + txs.sc); // [mmq_x * WARP_SIZE]
1084
+ half2 * tile_y_ds = (half2 *) (tile_y_qs + mmq_x*WARP_SIZE); // [mmq_x * WARP_SIZE/QI8_1];
1085
+
1086
+ const block_q8_1 * y = (const block_q8_1 *) yc;
1087
+
1088
+ const int blocks_per_row_x = ne00 / qk;
1089
+ const int blocks_per_col_y = ne10 / QK8_1;
1090
+ const int blocks_per_warp = WARP_SIZE / qi;
1091
+
1092
+ const int & ne1 = ne11;
1093
+
1094
+ const int tile_x_max_i = ne01 - blockIdx.x*mmq_y - 1;
1095
+
1096
+ float sum[(mmq_x/nwarps) * (mmq_y/WARP_SIZE)] = {0.0f};
1097
+
1098
+ for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) {
1099
+
1100
+ load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride00*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride00);
1101
+
1102
+ #pragma unroll
1103
+ for (int kr = 0; kr < qr; ++kr) {
1104
+ const int kqs = kr*WARP_SIZE + threadIdx.x;
1105
+ const int kbxd = kqs / QI8_1;
1106
+
1107
+ #pragma unroll
1108
+ for (int i0 = 0; i0 < mmq_x; i0 += nwarps) {
1109
+ const int i = min(blockIdx.y*mmq_x + threadIdx.y + i0, ne11-1); // to prevent out-of-bounds memory accesses
1110
+
1111
+ const block_q8_1 * by0 = &y[i*blocks_per_col_y + kb0 * (qk/QK8_1) + kbxd];
1112
+
1113
+ const int index_y = (i0 + threadIdx.y) * WARP_SIZE + kqs % WARP_SIZE;
1114
+ tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
1115
+ }
1116
+
1117
+ #pragma unroll
1118
+ for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
1119
+ const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
1120
+ const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
1121
+ const int i_y_eff = min(blockIdx.y*mmq_x + ids, ne11-1);
1122
+
1123
+ // if the sum is not needed it's faster to transform the scale to f32 ahead of time
1124
+ const half2 * dsi_src = &y[i_y_eff*blocks_per_col_y + kb0 * (qk/QK8_1) + kr*(WARP_SIZE/QI8_1) + kby].ds;
1125
+ half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
1126
+ if (need_sum) {
1127
+ *dsi_dst = *dsi_src;
1128
+ } else {
1129
+ float * dfi_dst = (float *) dsi_dst;
1130
+ *dfi_dst = __low2float(*dsi_src);
1131
+ }
1132
+ }
1133
+
1134
+ __syncthreads();
1135
+
1136
+ // #pragma unroll // unrolling this loop causes too much register pressure
1137
+ for (int k0 = kr*WARP_SIZE/qr; k0 < (kr+1)*WARP_SIZE/qr; k0 += vdr) {
1138
+ vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, sum, k0);
1139
+ }
1140
+
1141
+ __syncthreads();
1142
+ }
1143
+ }
1144
+
1145
+ #pragma unroll
1146
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
1147
+ const int j = blockIdx.y*mmq_x + j0 + threadIdx.y;
1148
+
1149
+ if (j >= ne1) {
1150
+ return;
1151
+ }
1152
+
1153
+ #pragma unroll
1154
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
1155
+ const int i = blockIdx.x*mmq_y + i0 + threadIdx.x;
1156
+
1157
+ if (need_check && i >= ne0) {
1158
+ continue;
1159
+ }
1160
+
1161
+ dst[j*ne0 + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
1162
+ }
1163
+ }
1164
+ }
1165
+
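
Each thread of mul_mat_q keeps its partial results in registers: sum[] holds (mmq_x/nwarps) * (mmq_y/WARP_SIZE) entries, indexed as (j0/nwarps)*(mmq_y/WARP_SIZE) + i0/WARP_SIZE, which matches the final write-out loop. A small host-side check of that layout, assuming WARP_SIZE == 32 and a hypothetical tile configuration (not part of the patch):

    #include <cstdio>

    int main() {
        const int warp_size = 32;
        const int mmq_x = 64, mmq_y = 128, nwarps = 8;  // hypothetical configuration

        const int per_thread = (mmq_x/nwarps) * (mmq_y/warp_size); // 8 * 4 = 32 floats

        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
            for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
                const int idx = (j0/nwarps) * (mmq_y/warp_size) + i0/warp_size;
                if (idx < 0 || idx >= per_thread) {
                    printf("index out of range\n");
                    return 1;
                }
            }
        }
        printf("accumulator entries per thread: %d\n", per_thread);
        return 0;
    }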
1166
+ struct mmq_args {
1167
+ const char * x; const char * y; float * dst;
1168
+ int64_t ne00; int64_t ne01; int64_t stride00;
1169
+ int64_t ne10; int64_t ne11;
1170
+ int64_t ne0;
1171
+ };
1172
+
1173
+ template <ggml_type type, int mmq_x, int nwarps>
1174
+ static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) {
1175
+ const int id = ggml_cuda_get_device();
1176
+ const int cc = ggml_cuda_info().devices[id].cc;
1177
+ const int mmq_y = get_mmq_y_host(cc, mmq_x);
1178
+
1179
+ const int block_num_x = (args.ne01 + mmq_y - 1) / mmq_y;
1180
+ const int block_num_y = (args.ne11 + mmq_x - 1) / mmq_x;
1181
+ const dim3 block_nums(block_num_x, block_num_y, 1);
1182
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
1183
+
1184
+ const tile_x_sizes txs = get_tile_x_sizes_host(type, mmq_y);
1185
+ const int shmem_x = txs.ql*sizeof(int) + txs.dm*sizeof(half2) + txs.qh*sizeof(int) + txs.sc*sizeof(int);
1186
+ const int shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2);
1187
+ const int shmem = shmem_x + shmem_y;
1188
+
1189
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
1190
+ static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
1191
+ if (!shmem_limit_raised[id]) {
1192
+ CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, nwarps, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
1193
+ CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, nwarps, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
1194
+ shmem_limit_raised[id] = true;
1195
+ }
1196
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
1197
+
1198
+ if (args.ne01 % mmq_y == 0) {
1199
+ const bool need_check = false;
1200
+ mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
1201
+ (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0);
1202
+ } else {
1203
+ const bool need_check = true;
1204
+ mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
1205
+ (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0);
1206
+ }
1207
+ }
1208
+
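
launch_mul_mat_q sizes the dynamic shared memory as the x tile (ql/dm/qh/sc) plus the y tile (quants and scales) and opts in via cudaFuncSetAttribute, since the request can exceed the default per-block limit on newer GPUs. A back-of-the-envelope version of that arithmetic with hypothetical tile sizes (not part of the patch; the real values come from get_tile_x_sizes_host and depend on type and mmq_y):

    #include <cstdio>

    int main() {
        const size_t warp_size = 32, qi8_1 = 8, mmq_x = 64;
        const size_t sizeof_half2 = 4; // two 16-bit halves

        // hypothetical tile_x_sizes, for illustration only
        const size_t ql = 128*(32 + 1), dm = 128, qh = 0, sc = 128*4;

        const size_t shmem_x = ql*sizeof(int) + dm*sizeof_half2 + qh*sizeof(int) + sc*sizeof(int);
        const size_t shmem_y = mmq_x*warp_size*sizeof(int) + mmq_x*(warp_size/qi8_1)*sizeof_half2;

        printf("requested dynamic shared memory: %zu bytes\n", shmem_x + shmem_y);
        return 0;
    }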
1209
+ template <ggml_type type>
1210
+ void mul_mat_q_case(const mmq_args & args, cudaStream_t stream) {
1211
+ const int id = ggml_cuda_get_device();
1212
+ const int nsm = ggml_cuda_info().devices[id].nsm;
1213
+ const int cc = ggml_cuda_info().devices[id].cc;
1214
+
1215
+ const int mmq_x_max = get_mmq_x_max_host(cc);
1216
+ const int mmq_y = get_mmq_y_host(cc, mmq_x_max);
1217
+ const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
1218
+
1219
+ int mmq_x_best = 0;
1220
+ int nwaves_best = INT_MAX;
1221
+
1222
+ for (int mmq_x = 8; mmq_x <= mmq_x_max && nwaves_best > 1; mmq_x += 8) {
1223
+ const int block_num_x = (args.ne11 + mmq_x - 1) / mmq_x;
1224
+ const int nwaves = (block_num_x*block_num_y + nsm - 1) / nsm;
1225
+
1226
+ if (nwaves < nwaves_best) {
1227
+ mmq_x_best = mmq_x;
1228
+ nwaves_best = nwaves;
1229
+ }
1230
+ }
1231
+
1232
+ switch (mmq_x_best) {
1233
+ case 8:
1234
+ launch_mul_mat_q<type, 8, 4>(args, stream);
1235
+ break;
1236
+ case 16:
1237
+ launch_mul_mat_q<type, 16, 8>(args, stream);
1238
+ break;
1239
+ case 24:
1240
+ launch_mul_mat_q<type, 24, 8>(args, stream);
1241
+ break;
1242
+ case 32:
1243
+ launch_mul_mat_q<type, 32, 8>(args, stream);
1244
+ break;
1245
+ case 40:
1246
+ launch_mul_mat_q<type, 40, 8>(args, stream);
1247
+ break;
1248
+ case 48:
1249
+ launch_mul_mat_q<type, 48, 8>(args, stream);
1250
+ break;
1251
+ case 56:
1252
+ launch_mul_mat_q<type, 56, 8>(args, stream);
1253
+ break;
1254
+ case 64:
1255
+ launch_mul_mat_q<type, 64, 8>(args, stream);
1256
+ break;
1257
+ case 72:
1258
+ launch_mul_mat_q<type, 72, 8>(args, stream);
1259
+ break;
1260
+ case 80:
1261
+ launch_mul_mat_q<type, 80, 8>(args, stream);
1262
+ break;
1263
+ case 88:
1264
+ launch_mul_mat_q<type, 88, 8>(args, stream);
1265
+ break;
1266
+ case 96:
1267
+ launch_mul_mat_q<type, 96, 8>(args, stream);
1268
+ break;
1269
+ case 104:
1270
+ launch_mul_mat_q<type, 104, 8>(args, stream);
1271
+ break;
1272
+ case 112:
1273
+ launch_mul_mat_q<type, 112, 8>(args, stream);
1274
+ break;
1275
+ case 120:
1276
+ launch_mul_mat_q<type, 120, 8>(args, stream);
1277
+ break;
1278
+ case 128:
1279
+ launch_mul_mat_q<type, 128, 8>(args, stream);
1280
+ break;
1281
+ default:
1282
+ GGML_ASSERT(false);
1283
+ break;
1284
+ }
1285
+ }
1286
+
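
mul_mat_q_case picks the tile width at runtime: it sweeps mmq_x in steps of 8 and keeps the value that minimizes the number of waves, i.e. ceil(total blocks / SM count), stopping early once a single wave is reached. The same heuristic restated as plain host code with made-up device and tensor sizes (not part of the patch):

    #include <climits>
    #include <cstdio>

    int main() {
        const int  nsm = 108;                      // hypothetical SM count
        const int  mmq_y = 128, mmq_x_max = 128;   // hypothetical tile limits
        const long ne01 = 4096, ne11 = 512;        // hypothetical matrix extents

        const long block_num_y = (ne01 + mmq_y - 1) / mmq_y;

        int mmq_x_best = 0, nwaves_best = INT_MAX;
        for (int mmq_x = 8; mmq_x <= mmq_x_max && nwaves_best > 1; mmq_x += 8) {
            const long block_num_x = (ne11 + mmq_x - 1) / mmq_x;
            const int  nwaves = (int) ((block_num_x*block_num_y + nsm - 1) / nsm);
            if (nwaves < nwaves_best) {
                mmq_x_best  = mmq_x;
                nwaves_best = nwaves;
            }
        }
        printf("mmq_x_best = %d, nwaves_best = %d\n", mmq_x_best, nwaves_best);
        return 0;
    }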
1287
+ #define DECL_MMQ_CASE(type) \
1288
+ template void mul_mat_q_case<type>(const mmq_args & args, cudaStream_t stream) \
1289
+
1290
+ extern DECL_MMQ_CASE(GGML_TYPE_Q4_0);
1291
+ extern DECL_MMQ_CASE(GGML_TYPE_Q4_1);
1292
+ extern DECL_MMQ_CASE(GGML_TYPE_Q5_0);
1293
+ extern DECL_MMQ_CASE(GGML_TYPE_Q5_1);
1294
+ extern DECL_MMQ_CASE(GGML_TYPE_Q8_0);
1295
+ extern DECL_MMQ_CASE(GGML_TYPE_Q2_K);
1296
+ extern DECL_MMQ_CASE(GGML_TYPE_Q3_K);
1297
+ extern DECL_MMQ_CASE(GGML_TYPE_Q4_K);
1298
+ extern DECL_MMQ_CASE(GGML_TYPE_Q5_K);
1299
+ extern DECL_MMQ_CASE(GGML_TYPE_Q6_K);
1300
+
1301
+ // -------------------------------------------------------------------------------------------------------------------------
1302
 
1303
  void ggml_cuda_op_mul_mat_q(
1304
  ggml_backend_cuda_context & ctx,
ggml-cuda/mmvq.cu CHANGED
@@ -1,9 +1,47 @@
1
  #include "mmvq.cuh"
2
  #include "vecdotq.cuh"
3
 
4
- typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
5
 
6
- template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
7
  #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
8
  // tell the compiler to use as many registers as it wants, see nwarps definition below
9
  __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
@@ -12,6 +50,12 @@ static __global__ void mul_mat_vec_q(
12
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
13
  const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
14
 
15
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
16
  constexpr int nwarps = 1;
17
  constexpr int rows_per_cuda_block = 1;
@@ -29,7 +73,6 @@ static __global__ void mul_mat_vec_q(
29
  // partial sum for each thread
30
  float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
31
 
32
- const block_q_t * x = (const block_q_t *) vx;
33
  const block_q8_1 * y = (const block_q8_1 *) vy;
34
 
35
  for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
@@ -42,8 +85,7 @@ static __global__ void mul_mat_vec_q(
42
  for (int j = 0; j < ncols_y; ++j) {
43
  #pragma unroll
44
  for (int i = 0; i < rows_per_cuda_block; ++i) {
45
- tmp[j][i] += vec_dot_q_cuda(
46
- &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
47
  }
48
  }
49
  }
@@ -81,12 +123,12 @@ static __global__ void mul_mat_vec_q(
81
  }
82
  }
83
 
84
- template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot>
85
  static void mul_mat_vec_q_cuda(
86
  const void * vx, const void * vy, float * dst,
87
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
88
 
89
- GGML_ASSERT(ncols_x % qk == 0);
90
  GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
91
 
92
  int id = ggml_cuda_get_device();
@@ -124,36 +166,28 @@ static void mul_mat_vec_q_cuda(
124
 
125
  switch (ncols_y) {
126
  case 1:
127
- mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
128
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
129
  break;
130
  case 2:
131
- mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
132
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
133
  break;
134
  case 3:
135
- mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
136
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
137
  break;
138
  case 4:
139
- mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
140
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
141
  break;
142
  case 5:
143
- mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
144
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
145
  break;
146
  case 6:
147
- mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
148
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
149
  break;
150
  case 7:
151
- mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
152
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
153
  break;
154
  case 8:
155
- mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
156
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
157
  break;
158
  default:
159
  GGML_ASSERT(false);
@@ -165,152 +199,133 @@ static void mul_mat_vec_q4_0_q8_1_cuda(
165
  const void * vx, const void * vy, float * dst,
166
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
167
 
168
- mul_mat_vec_q_cuda<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
169
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
170
  }
171
 
172
  static void mul_mat_vec_q4_1_q8_1_cuda(
173
  const void * vx, const void * vy, float * dst,
174
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
175
 
176
- mul_mat_vec_q_cuda<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
177
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
178
  }
179
 
180
  static void mul_mat_vec_q5_0_q8_1_cuda(
181
  const void * vx, const void * vy, float * dst,
182
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
183
 
184
- mul_mat_vec_q_cuda<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
185
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
186
  }
187
 
188
  static void mul_mat_vec_q5_1_q8_1_cuda(
189
  const void * vx, const void * vy, float * dst,
190
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
191
 
192
- mul_mat_vec_q_cuda<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
193
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
194
  }
195
 
196
  static void mul_mat_vec_q8_0_q8_1_cuda(
197
  const void * vx, const void * vy, float * dst,
198
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
199
 
200
- mul_mat_vec_q_cuda<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
201
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
202
  }
203
 
204
  static void mul_mat_vec_q2_K_q8_1_cuda(
205
  const void * vx, const void * vy, float * dst,
206
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
207
 
208
- mul_mat_vec_q_cuda<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
209
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
210
  }
211
 
212
  static void mul_mat_vec_q3_K_q8_1_cuda(
213
  const void * vx, const void * vy, float * dst,
214
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
215
 
216
- mul_mat_vec_q_cuda<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
217
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
218
  }
219
 
220
  static void mul_mat_vec_q4_K_q8_1_cuda(
221
  const void * vx, const void * vy, float * dst,
222
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
223
 
224
- mul_mat_vec_q_cuda<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
225
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
226
  }
227
 
228
  static void mul_mat_vec_q5_K_q8_1_cuda(
229
  const void * vx, const void * vy, float * dst,
230
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
231
 
232
- mul_mat_vec_q_cuda<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
233
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
234
  }
235
 
236
  static void mul_mat_vec_q6_K_q8_1_cuda(
237
  const void * vx, const void * vy, float * dst,
238
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
239
 
240
- mul_mat_vec_q_cuda<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
241
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
242
  }
243
 
244
  static void mul_mat_vec_iq2_xxs_q8_1_cuda(
245
  const void * vx, const void * vy, float * dst,
246
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
247
 
248
- mul_mat_vec_q_cuda<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
249
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
250
  }
251
 
252
  static void mul_mat_vec_iq2_xs_q8_1_cuda(
253
  const void * vx, const void * vy, float * dst,
254
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
255
 
256
- mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
257
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
258
  }
259
 
260
  static void mul_mat_vec_iq2_s_q8_1_cuda(
261
  const void * vx, const void * vy, float * dst,
262
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
263
 
264
- mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
265
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
266
  }
267
 
268
  static void mul_mat_vec_iq3_xxs_q8_1_cuda(
269
  const void * vx, const void * vy, float * dst,
270
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
271
 
272
- mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
273
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
274
  }
275
 
276
  static void mul_mat_vec_iq1_s_q8_1_cuda(
277
  const void * vx, const void * vy, float * dst,
278
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
279
 
280
- mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
281
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
282
  }
283
 
284
  static void mul_mat_vec_iq1_m_q8_1_cuda(
285
  const void * vx, const void * vy, float * dst,
286
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
287
 
288
- mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_m, 1, vec_dot_iq1_m_q8_1>
289
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
290
  }
291
 
292
  static void mul_mat_vec_iq4_nl_q8_1_cuda(
293
  const void * vx, const void * vy, float * dst,
294
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
295
 
296
- mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
297
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
298
  }
299
 
300
  static void mul_mat_vec_iq4_xs_q8_1_cuda(
301
  const void * vx, const void * vy, float * dst,
302
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
303
 
304
- mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
305
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
306
  }
307
 
308
  static void mul_mat_vec_iq3_s_q8_1_cuda(
309
  const void * vx, const void * vy, float * dst,
310
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
311
 
312
- mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
313
- (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
314
  }
315
 
316
  void ggml_cuda_op_mul_mat_vec_q(
 
1
  #include "mmvq.cuh"
2
  #include "vecdotq.cuh"
3
 
4
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
5
+
6
+ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
7
+ return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
8
+ type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
9
+ type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
10
+ type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
11
+ type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
12
+ type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
13
+ type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
14
+ type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 :
15
+ type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 :
16
+ type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 :
17
+ type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 :
18
+ type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 :
19
+ type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 :
20
+ type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 :
21
+ type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 :
22
+ type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 :
23
+ type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 :
24
+ type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
25
+ type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
26
+ nullptr;
27
+ }
28
+
29
+ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
30
+ return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
31
+ type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
32
+ type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
33
+ type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
34
+ type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
35
+ type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
36
+ type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
37
+ type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
38
+ type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
39
+ type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
40
+ type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ :
41
+ 1;
42
+ }
43
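
The two constexpr helpers above replace the old per-type template parameters of mul_mat_vec_q: given only a ggml_type, a chain of ternaries folds to a single compile-time constant (or function pointer) per instantiation. A standalone illustration of that pattern (not part of the patch; enum and values are made up, not ggml's):

    #include <cstdio>

    enum demo_type { DEMO_A, DEMO_B, DEMO_C };

    static constexpr int get_vdr_demo(demo_type t) {
        return t == DEMO_A ? 2 :
               t == DEMO_B ? 4 :
               1;
    }

    template <demo_type t>
    void kernel_stub() {
        constexpr int vdr = get_vdr_demo(t); // folded at compile time, like get_vdr_mmvq
        printf("vdr = %d\n", vdr);
    }

    int main() {
        kernel_stub<DEMO_A>();
        kernel_stub<DEMO_C>();
        return 0;
    }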
 
44
+ template <ggml_type type, int ncols_y>
45
  #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
46
  // tell the compiler to use as many registers as it wants, see nwarps definition below
47
  __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
 
50
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
51
  const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
52
 
53
+ constexpr int qk = ggml_cuda_type_traits<type>::qk;
54
+ constexpr int qi = ggml_cuda_type_traits<type>::qi;
55
+ constexpr int vdr = get_vdr_mmvq(type);
56
+
57
+ constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
58
+
59
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
60
  constexpr int nwarps = 1;
61
  constexpr int rows_per_cuda_block = 1;
 
73
  // partial sum for each thread
74
  float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
75
 
 
76
  const block_q8_1 * y = (const block_q8_1 *) vy;
77
 
78
  for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
 
85
  for (int j = 0; j < ncols_y; ++j) {
86
  #pragma unroll
87
  for (int i = 0; i < rows_per_cuda_block; ++i) {
88
+ tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
 
89
  }
90
  }
91
  }
 
123
  }
124
  }
125
 
126
+ template <ggml_type type>
127
  static void mul_mat_vec_q_cuda(
128
  const void * vx, const void * vy, float * dst,
129
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
130
 
131
+ GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
132
  GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
133
 
134
  int id = ggml_cuda_get_device();
 
166
 
167
  switch (ncols_y) {
168
  case 1:
169
+ mul_mat_vec_q<type, 1><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 
170
  break;
171
  case 2:
172
+ mul_mat_vec_q<type, 2><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 
173
  break;
174
  case 3:
175
+ mul_mat_vec_q<type, 3><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 
176
  break;
177
  case 4:
178
+ mul_mat_vec_q<type, 4><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 
179
  break;
180
  case 5:
181
+ mul_mat_vec_q<type, 5><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 
182
  break;
183
  case 6:
184
+ mul_mat_vec_q<type, 6><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 
185
  break;
186
  case 7:
187
+ mul_mat_vec_q<type, 7><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 
188
  break;
189
  case 8:
190
+ mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
 
191
  break;
192
  default:
193
  GGML_ASSERT(false);
 
199
  const void * vx, const void * vy, float * dst,
200
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
201
 
202
+ mul_mat_vec_q_cuda<GGML_TYPE_Q4_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
203
  }
204
 
205
  static void mul_mat_vec_q4_1_q8_1_cuda(
206
  const void * vx, const void * vy, float * dst,
207
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
208
 
209
+ mul_mat_vec_q_cuda<GGML_TYPE_Q4_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
210
  }
211
 
212
  static void mul_mat_vec_q5_0_q8_1_cuda(
213
  const void * vx, const void * vy, float * dst,
214
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
215
 
216
+ mul_mat_vec_q_cuda<GGML_TYPE_Q5_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
217
  }
218
 
219
  static void mul_mat_vec_q5_1_q8_1_cuda(
220
  const void * vx, const void * vy, float * dst,
221
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
222
 
223
+ mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
224
  }
225
 
226
  static void mul_mat_vec_q8_0_q8_1_cuda(
227
  const void * vx, const void * vy, float * dst,
228
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
229
 
230
+ mul_mat_vec_q_cuda<GGML_TYPE_Q8_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
231
  }
232
 
233
  static void mul_mat_vec_q2_K_q8_1_cuda(
234
  const void * vx, const void * vy, float * dst,
235
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
236
 
237
+ mul_mat_vec_q_cuda<GGML_TYPE_Q2_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
238
  }
239
 
240
  static void mul_mat_vec_q3_K_q8_1_cuda(
241
  const void * vx, const void * vy, float * dst,
242
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
243
 
244
+ mul_mat_vec_q_cuda<GGML_TYPE_Q3_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
245
  }
246
 
247
  static void mul_mat_vec_q4_K_q8_1_cuda(
248
  const void * vx, const void * vy, float * dst,
249
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
250
 
251
+ mul_mat_vec_q_cuda<GGML_TYPE_Q4_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
252
  }
253
 
254
  static void mul_mat_vec_q5_K_q8_1_cuda(
255
  const void * vx, const void * vy, float * dst,
256
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
257
 
258
+ mul_mat_vec_q_cuda<GGML_TYPE_Q5_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
259
  }
260
 
261
  static void mul_mat_vec_q6_K_q8_1_cuda(
262
  const void * vx, const void * vy, float * dst,
263
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
264
 
265
+ mul_mat_vec_q_cuda<GGML_TYPE_Q6_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
266
  }
267
 
268
  static void mul_mat_vec_iq2_xxs_q8_1_cuda(
269
  const void * vx, const void * vy, float * dst,
270
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
271
 
272
+ mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
273
  }
274
 
275
  static void mul_mat_vec_iq2_xs_q8_1_cuda(
276
  const void * vx, const void * vy, float * dst,
277
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
278
 
279
+ mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
280
  }
281
 
282
  static void mul_mat_vec_iq2_s_q8_1_cuda(
283
  const void * vx, const void * vy, float * dst,
284
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
285
 
286
+ mul_mat_vec_q_cuda<GGML_TYPE_IQ2_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
287
  }
288
 
289
  static void mul_mat_vec_iq3_xxs_q8_1_cuda(
290
  const void * vx, const void * vy, float * dst,
291
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
292
 
293
+ mul_mat_vec_q_cuda<GGML_TYPE_IQ3_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
294
  }
295
 
296
  static void mul_mat_vec_iq1_s_q8_1_cuda(
297
  const void * vx, const void * vy, float * dst,
298
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
299
 
300
+ mul_mat_vec_q_cuda<GGML_TYPE_IQ1_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
301
  }
302
 
303
  static void mul_mat_vec_iq1_m_q8_1_cuda(
304
  const void * vx, const void * vy, float * dst,
305
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
306
 
307
+ mul_mat_vec_q_cuda<GGML_TYPE_IQ1_M>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
308
  }
309
 
310
  static void mul_mat_vec_iq4_nl_q8_1_cuda(
311
  const void * vx, const void * vy, float * dst,
312
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
313
 
314
+ mul_mat_vec_q_cuda<GGML_TYPE_IQ4_NL>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
315
  }
316
 
317
  static void mul_mat_vec_iq4_xs_q8_1_cuda(
318
  const void * vx, const void * vy, float * dst,
319
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
320
 
321
+ mul_mat_vec_q_cuda<GGML_TYPE_IQ4_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
322
  }
323
 
324
  static void mul_mat_vec_iq3_s_q8_1_cuda(
325
  const void * vx, const void * vy, float * dst,
326
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
327
 
328
+ mul_mat_vec_q_cuda<GGML_TYPE_IQ3_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 
329
  }
330
 
331
  void ggml_cuda_op_mul_mat_vec_q(
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu CHANGED
@@ -1,4 +1,4 @@
1
- // This file has been autogenerated by generate-variants.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
 
1
+ // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu CHANGED
@@ -1,4 +1,4 @@
1
- // This file has been autogenerated by generate-variants.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
 
1
+ // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu CHANGED
@@ -1,4 +1,4 @@
1
- // This file has been autogenerated by generate-variants.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
 
1
+ // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu CHANGED
@@ -1,4 +1,4 @@
1
- // This file has been autogenerated by generate-variants.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
 
1
+ // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu CHANGED
@@ -1,4 +1,4 @@
1
- // This file has been autogenerated by generate-variants.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
 
1
+ // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu CHANGED
@@ -1,4 +1,4 @@
1
- // This file has been autogenerated by generate-variants.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
 
1
+ // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu CHANGED
@@ -1,4 +1,4 @@
1
- // This file has been autogenerated by generate-variants.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
 
1
+ // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu CHANGED
@@ -1,4 +1,4 @@
1
- // This file has been autogenerated by generate-variants.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
 
1
+ // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu CHANGED
@@ -1,4 +1,4 @@
1
- // This file has been autogenerated by generate-variants.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
 
1
+ // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu CHANGED
@@ -1,4 +1,4 @@
1
- // This file has been autogenerated by generate-variants.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
 
1
+ // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu CHANGED
@@ -1,4 +1,4 @@
1
- // This file has been autogenerated by generate-variants.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
 
1
+ // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
 
3
  #include "../fattn-vec-f16.cuh"
4
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu CHANGED
@@ -1,4 +1,4 @@
-// This file has been autogenerated by generate-variants.py, do not edit manually.
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
 
 #include "../fattn-vec-f16.cuh"
 
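Note: every hunk above is the same one-line change, updating the autogenerated-file header comment to name generate_cu_files.py instead of generate-variants.py. The generator itself is not shown in this diff; as a rough illustration only, a script of that kind might emit one .cu file per (head size, K type, V type) combination along these lines. The combination list, header text, and output path below are assumptions inferred from the instance file names in this commit, not the real script.

```python
#!/usr/bin/env python3
# Hypothetical sketch of a generator in the spirit of generate_cu_files.py (not the real script):
# write one template-instance .cu file per (head size, K type, V type) combination so that
# each instantiation can be built as its own translation unit.

import os

TYPES = ["f16", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"]  # assumed from the file names

# Combinations observed in this diff: all K/V pairs for head size 128,
# f16 K with all V types for head size 64, and f16/f16 for head size 256.
COMBOS  = [(128, k, v) for k in TYPES for v in TYPES]
COMBOS += [(64, "f16", v) for v in TYPES]
COMBOS += [(256, "f16", "f16")]

HEADER = "// This file has been autogenerated by generate_cu_files.py, do not edit manually.\n"
OUT_DIR = "ggml-cuda/template-instances"

os.makedirs(OUT_DIR, exist_ok=True)
for head_size, type_k, type_v in COMBOS:
    name = f"fattn-vec-f16-instance-hs{head_size}-{type_k}-{type_v}.cu"
    with open(os.path.join(OUT_DIR, name), "w") as f:
        f.write(HEADER)
        f.write("\n")
        f.write('#include "../fattn-vec-f16.cuh"\n')
        f.write("\n")
        # The real instance files also contain an explicit kernel instantiation for the
        # given (head size, K type, V type); it is omitted here because that part of the
        # files is not visible in the hunks above.
```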