JohannesGaessler committed on
Commit afa1447 · 1 Parent(s): 06acee2

CUDA: refactor and optimize IQ MMVQ (llama/8215)


* CUDA: refactor and optimize IQ MMVQ

* uint -> uint32_t

* __dp4a -> ggml_cuda_dp4a

* remove MIN_CC_DP4A checks

* change default

* try CI fix
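For context on the "__dp4a -> ggml_cuda_dp4a" and "remove MIN_CC_DP4A checks" points: the compute-capability gating now lives in a single wrapper, ggml_cuda_dp4a (added to common.cuh below). A minimal sketch of its reference semantics, matching the scalar fallback in the diff (the function name here is illustrative):

    #include <cstdint>   // int8_t

    // Treat a and b as four packed signed 8-bit lanes, multiply lane-wise,
    // and add the four products to the accumulator c.
    static int dp4a_reference(int a, int b, int c) {
        const int8_t * a8 = (const int8_t *) &a;
        const int8_t * b8 = (const int8_t *) &b;
        return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
    }

On NVIDIA GPUs with __CUDA_ARCH__ >= MIN_CC_DP4A the wrapper lowers to the __dp4a intrinsic, so callers no longer need their own MIN_CC_DP4A branches.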

ggml/src/ggml-common.h CHANGED
@@ -106,19 +106,19 @@ typedef sycl::half2 ggml_half2;
106
  #define QR6_K 2
107
 
108
  #define QI2_XXS (QK_K / (4*QR2_XXS))
109
- #define QR2_XXS 8
110
 
111
  #define QI2_XS (QK_K / (4*QR2_XS))
112
- #define QR2_XS 8
113
 
114
  #define QI2_S (QK_K / (4*QR2_S))
115
- #define QR2_S 8
116
 
117
  #define QI3_XXS (QK_K / (4*QR3_XXS))
118
- #define QR3_XXS 8
119
 
120
  #define QI3_XS (QK_K / (4*QR3_XS))
121
- #define QR3_XS 8
122
 
123
  #define QI1_S (QK_K / (4*QR1_S))
124
  #define QR1_S 8
@@ -130,10 +130,10 @@ typedef sycl::half2 ggml_half2;
130
  #define QR4_NL 2
131
 
132
  #define QI4_XS (QK_K / (4*QR4_XS))
133
- #define QR4_XS 8
134
 
135
  #define QI3_S (QK_K / (4*QR3_S))
136
- #define QR3_S 8
137
 
138
  #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
139
 
 
106
  #define QR6_K 2
107
 
108
  #define QI2_XXS (QK_K / (4*QR2_XXS))
109
+ #define QR2_XXS 4
110
 
111
  #define QI2_XS (QK_K / (4*QR2_XS))
112
+ #define QR2_XS 4
113
 
114
  #define QI2_S (QK_K / (4*QR2_S))
115
+ #define QR2_S 4
116
 
117
  #define QI3_XXS (QK_K / (4*QR3_XXS))
118
+ #define QR3_XXS 4
119
 
120
  #define QI3_XS (QK_K / (4*QR3_XS))
121
+ #define QR3_XS 4
122
 
123
  #define QI1_S (QK_K / (4*QR1_S))
124
  #define QR1_S 8
 
130
  #define QR4_NL 2
131
 
132
  #define QI4_XS (QK_K / (4*QR4_XS))
133
+ #define QR4_XS 2
134
 
135
  #define QI3_S (QK_K / (4*QR3_S))
136
+ #define QR3_S 4
137
 
138
  #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
139
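The new defaults change the derived QI values; as a worked example (assuming the usual QK_K = 256):

    // QI is defined as QK_K / (4*QR), so halving QR doubles QI:
    //   before: QR2_XXS = 8  ->  QI2_XXS = 256 / (4*8) = 8
    //   after:  QR2_XXS = 4  ->  QI2_XXS = 256 / (4*4) = 16
    // The same relation gives QI = 16 for the other IQ types switched to
    // QR = 4, and QI4_XS = 32 for QR4_XS = 2; the refactored IQ mmvq
    // kernels index into each block through these QI values.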
 
ggml/src/ggml-cuda.cu CHANGED
@@ -1883,6 +1883,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
1883
  bool use_mul_mat_q = ggml_is_quantized(src0->type)
1884
  && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
1885
 
 
 
 
 
 
1886
  bool any_gpus_with_slow_fp16 = false;
1887
 
1888
  if (split) {
@@ -1895,22 +1900,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
1895
  }
1896
 
1897
  const int cc = ggml_cuda_info().devices[id].cc;
1898
- use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
1899
  use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1900
  any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
1901
  }
1902
  } else {
1903
  const int cc = ggml_cuda_info().devices[ctx.device].cc;
1904
- use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
1905
  use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1906
  any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
1907
  }
1908
 
1909
- // if mmvq is available it's a better choice than dmmv:
1910
- #ifndef GGML_CUDA_FORCE_DMMV
1911
- use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
1912
- #endif // GGML_CUDA_FORCE_DMMV
1913
-
1914
  // debug helpers
1915
  //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
1916
  //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
 
1883
  bool use_mul_mat_q = ggml_is_quantized(src0->type)
1884
  && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
1885
 
1886
+ // if mmvq is available it's a better choice than dmmv:
1887
+ #ifndef GGML_CUDA_FORCE_DMMV
1888
+ use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
1889
+ #endif // GGML_CUDA_FORCE_DMMV
1890
+
1891
  bool any_gpus_with_slow_fp16 = false;
1892
 
1893
  if (split) {
 
1900
  }
1901
 
1902
  const int cc = ggml_cuda_info().devices[id].cc;
 
1903
  use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1904
  any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
1905
  }
1906
  } else {
1907
  const int cc = ggml_cuda_info().devices[ctx.device].cc;
 
1908
  use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1909
  any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
1910
  }
1911
 
 
 
 
 
 
1912
  // debug helpers
1913
  //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
1914
  //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
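Net effect in ggml_cuda_mul_mat: mmvq no longer needs a per-device compute-capability check (the DP4A requirement is handled inside ggml_cuda_dp4a), so the dmmv-vs-mmvq preference is decided once, before the per-device loop; the moved block is unchanged in substance:

    // if mmvq is available it's a better choice than dmmv:
    #ifndef GGML_CUDA_FORCE_DMMV
        use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
    #endif // GGML_CUDA_FORCE_DMMV

The per-device loop keeps only the ggml_cuda_should_use_mmq and fast_fp16_available checks.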
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -3,6 +3,7 @@
3
  #include "ggml.h"
4
  #include "ggml-cuda.h"
5
 
 
6
  #include <memory>
7
 
8
  #if defined(GGML_USE_HIPBLAS)
@@ -268,30 +269,15 @@ static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigne
268
  return c;
269
  }
270
 
271
- static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
272
- #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
273
- c = __builtin_amdgcn_sdot4(a, b, c, false);
274
- #elif defined(RDNA3)
275
- c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
276
- #elif defined(__gfx1010__) || defined(__gfx900__)
277
- int tmp1;
278
- int tmp2;
279
- asm("\n \
280
- v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
281
- v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
282
- v_add3_u32 %0, %1, %2, %0 \n \
283
- v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
284
- v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
285
- v_add3_u32 %0, %1, %2, %0 \n \
286
- "
287
- : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
288
- : "v"(a), "v"(b)
289
- );
290
- #else
291
- const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
292
- const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
293
- c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
294
- #endif
295
  return c;
296
  }
297
 
@@ -467,8 +453,48 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
467
  }
468
  #endif // CUDART_VERSION < 12000
469
 
 
470
  // TODO: move to ggml-common.h
471
- static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
472
 
473
  typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
474
 
 
3
  #include "ggml.h"
4
  #include "ggml-cuda.h"
5
 
6
+ #include <cstdint>
7
  #include <memory>
8
 
9
  #if defined(GGML_USE_HIPBLAS)
 
269
  return c;
270
  }
271
 
272
+ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
273
+ const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
274
+ const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
275
+ unsigned int c;
276
+ uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
277
+ #pragma unroll
278
+ for (int i = 0; i < 4; ++i) {
279
+ vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
280
+ }
281
  return c;
282
  }
283
 
 
453
  }
454
  #endif // CUDART_VERSION < 12000
455
 
456
+ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
457
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
458
+ #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
459
+ c = __builtin_amdgcn_sdot4(a, b, c, false);
460
+ #elif defined(RDNA3)
461
+ c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
462
+ #elif defined(__gfx1010__) || defined(__gfx900__)
463
+ int tmp1;
464
+ int tmp2;
465
+ asm("\n \
466
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
467
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
468
+ v_add3_u32 %0, %1, %2, %0 \n \
469
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
470
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
471
+ v_add3_u32 %0, %1, %2, %0 \n \
472
+ "
473
+ : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
474
+ : "v"(a), "v"(b)
475
+ );
476
+ #else
477
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
478
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
479
+ c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
480
+ #endif
481
+ return c;
482
+
483
+ #else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
484
+
485
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
486
+ return __dp4a(a, b, c);
487
+ #else // __CUDA_ARCH__ >= MIN_CC_DP4A
488
+ const int8_t * a8 = (const int8_t *) &a;
489
+ const int8_t * b8 = (const int8_t *) &b;
490
+ return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
491
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
492
+
493
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
494
+ }
495
+
496
  // TODO: move to ggml-common.h
497
+ static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
498
 
499
  typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
500
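For reference, the IQ vec-dot kernels combine these building blocks roughly as follows; this is a condensed sketch of the sign-handling pattern visible in vecdotq.cuh further down, with an illustrative helper name:

    // Applied per byte, (x ^ s) - s negates the lanes whose mask byte is 0xff
    // and leaves the rest unchanged; the result feeds the packed dot product.
    static __device__ __forceinline__ int signed_dot_sketch(int grid, unsigned int sign_mask, int q8, int sumi) {
        const int grid_signed = __vsub4(grid ^ sign_mask, sign_mask);
        return ggml_cuda_dp4a(grid_signed, q8, sumi);
    }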
 
ggml/src/ggml-cuda/fattn-common.cuh CHANGED
@@ -54,12 +54,11 @@ typedef float (*vec_dot_KQ_f32_t)(
54
  template<typename T, int D>
55
  static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
56
  const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
57
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
58
 
59
  const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
60
  GGML_UNUSED(Q_v);
61
 
62
- half sum = 0.0f;
63
 
64
  #pragma unroll
65
  for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
@@ -72,7 +71,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
72
  const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
73
  const int u = Q_q8[k_KQ_0/WARP_SIZE];
74
 
75
- const int sumi = __dp4a(v, u, 0);
76
 
77
  #ifdef FP16_AVAILABLE
78
  if (std::is_same<T, half>::value) {
@@ -90,19 +89,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
90
  }
91
 
92
  return sum;
93
- #else
94
- GGML_UNUSED(K_c);
95
- GGML_UNUSED(Q_v);
96
- GGML_UNUSED(Q_q8);
97
- GGML_UNUSED(Q_ds_v);
98
- NO_DEVICE_CODE;
99
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
100
  }
101
 
102
  template<typename T, int D>
103
  static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
104
  const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
105
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
106
 
107
  const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
108
  GGML_UNUSED(Q_v);
@@ -120,7 +111,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
120
  const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
121
  const int u = Q_q8[k_KQ_0/WARP_SIZE];
122
 
123
- const int sumi = __dp4a(v, u, 0);
124
 
125
  #ifdef FP16_AVAILABLE
126
  if (std::is_same<T, half>::value) {
@@ -142,19 +133,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
142
  }
143
 
144
  return sum;
145
- #else
146
- GGML_UNUSED(K_c);
147
- GGML_UNUSED(Q_v);
148
- GGML_UNUSED(Q_q8);
149
- GGML_UNUSED(Q_ds_v);
150
- NO_DEVICE_CODE;
151
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
152
  }
153
 
154
  template<typename T, int D>
155
  static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
156
  const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
157
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
158
 
159
  const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
160
  GGML_UNUSED(Q_v);
@@ -179,7 +162,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
179
 
180
  const int u = Q_q8[k_KQ_0/WARP_SIZE];
181
 
182
- const int sumi = __dp4a(v, u, 0);
183
 
184
  #ifdef FP16_AVAILABLE
185
  if (std::is_same<T, half>::value) {
@@ -197,19 +180,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
197
  }
198
 
199
  return sum;
200
- #else
201
- GGML_UNUSED(K_c);
202
- GGML_UNUSED(Q_v);
203
- GGML_UNUSED(Q_q8);
204
- GGML_UNUSED(Q_ds_v);
205
- NO_DEVICE_CODE;
206
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
207
  }
208
 
209
  template<typename T, int D>
210
  static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
211
  const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
212
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
213
 
214
  const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
215
  GGML_UNUSED(Q_v);
@@ -234,7 +209,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
234
 
235
  const int u = Q_q8[k_KQ_0/WARP_SIZE];
236
 
237
- const int sumi = __dp4a(v, u, 0);
238
 
239
  #ifdef FP16_AVAILABLE
240
  if (std::is_same<T, half>::value) {
@@ -256,19 +231,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
256
  }
257
 
258
  return sum;
259
- #else
260
- GGML_UNUSED(K_c);
261
- GGML_UNUSED(Q_v);
262
- GGML_UNUSED(Q_q8);
263
- GGML_UNUSED(Q_ds_v);
264
- NO_DEVICE_CODE;
265
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
266
  }
267
 
268
  template <typename T, int D>
269
  static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
270
  const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
271
- #if __CUDA_ARCH__ >= MIN_CC_DP4A
272
 
273
  const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
274
  GGML_UNUSED(Q_v);
@@ -297,13 +264,6 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
297
  }
298
 
299
  return sum;
300
- #else
301
- GGML_UNUSED(K_c);
302
- GGML_UNUSED(Q_v);
303
- GGML_UNUSED(Q_q8);
304
- GGML_UNUSED(Q_ds_v);
305
- NO_DEVICE_CODE;
306
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
307
  }
308
 
309
  template <typename T, int D>
 
54
  template<typename T, int D>
55
  static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
56
  const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 
57
 
58
  const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
59
  GGML_UNUSED(Q_v);
60
 
61
+ T sum = 0.0f;
62
 
63
  #pragma unroll
64
  for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
 
71
  const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
72
  const int u = Q_q8[k_KQ_0/WARP_SIZE];
73
 
74
+ const int sumi = ggml_cuda_dp4a(v, u, 0);
75
 
76
  #ifdef FP16_AVAILABLE
77
  if (std::is_same<T, half>::value) {
 
89
  }
90
 
91
  return sum;
 
 
 
 
 
 
 
92
  }
93
 
94
  template<typename T, int D>
95
  static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
96
  const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 
97
 
98
  const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
99
  GGML_UNUSED(Q_v);
 
111
  const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
112
  const int u = Q_q8[k_KQ_0/WARP_SIZE];
113
 
114
+ const int sumi = ggml_cuda_dp4a(v, u, 0);
115
 
116
  #ifdef FP16_AVAILABLE
117
  if (std::is_same<T, half>::value) {
 
133
  }
134
 
135
  return sum;
 
 
 
 
 
 
 
136
  }
137
 
138
  template<typename T, int D>
139
  static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
140
  const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 
141
 
142
  const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
143
  GGML_UNUSED(Q_v);
 
162
 
163
  const int u = Q_q8[k_KQ_0/WARP_SIZE];
164
 
165
+ const int sumi = ggml_cuda_dp4a(v, u, 0);
166
 
167
  #ifdef FP16_AVAILABLE
168
  if (std::is_same<T, half>::value) {
 
180
  }
181
 
182
  return sum;
 
 
 
 
 
 
 
183
  }
184
 
185
  template<typename T, int D>
186
  static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
187
  const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 
188
 
189
  const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
190
  GGML_UNUSED(Q_v);
 
209
 
210
  const int u = Q_q8[k_KQ_0/WARP_SIZE];
211
 
212
+ const int sumi = ggml_cuda_dp4a(v, u, 0);
213
 
214
  #ifdef FP16_AVAILABLE
215
  if (std::is_same<T, half>::value) {
 
231
  }
232
 
233
  return sum;
 
 
 
 
 
 
 
234
  }
235
 
236
  template <typename T, int D>
237
  static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
238
  const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 
239
 
240
  const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
241
  GGML_UNUSED(Q_v);
 
264
  }
265
 
266
  return sum;
 
 
 
 
 
 
 
267
  }
268
 
269
  template <typename T, int D>
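Besides the switch to ggml_cuda_dp4a and the removal of the MIN_CC_DP4A guards, note that the accumulator in these vec_dot_fattn_vec_KQ_* templates now follows the template parameter (T sum instead of half sum), so float instantiations accumulate in fp32. Schematically (a sketch, not the literal kernel code):

    // With T = float the per-step accumulation runs in fp32; with T = half it
    // keeps the previous half-precision behaviour.
    template <typename T>
    static __device__ __forceinline__ T accumulate_step_sketch(T sum, int sumi, float scale) {
        return sum + (T)(((float) sumi)*scale);
    }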
ggml/src/ggml-cuda/mmvq.cu CHANGED
@@ -28,16 +28,22 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
28
 
29
  static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
30
  return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
31
- type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
32
- type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
33
- type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
34
- type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
35
- type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
36
- type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
37
- type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
38
- type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
39
- type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
40
- type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ :
 
 
 
 
 
 
41
  1;
42
  }
43
 
 
28
 
29
  static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
30
  return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
31
+ type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
32
+ type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
33
+ type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
34
+ type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
35
+ type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
36
+ type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
37
+ type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
38
+ type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
39
+ type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
40
+ type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
41
+ type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ :
42
+ type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ :
43
+ type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
44
+ type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ :
45
+ type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ :
46
+ type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ :
47
  1;
48
  }
49
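get_vdr_mmvq now returns a dedicated VDR_IQ*_Q8_1_MMVQ constant for the IQ2, IQ3 and IQ4 types instead of falling through to 1 (IQ4_NL previously borrowed VDR_Q4_K_Q8_1_MMVQ). VDR, the "vec dot ratio", is the number of contiguous ints each thread processes per vec-dot call. A hypothetical sketch of how such a compile-time constant is consumed (names below are not from the commit):

    // Because vdr is a constexpr per quant type, the inner loop over the
    // packed ints can be fully unrolled.
    template <ggml_type type>
    static __device__ __forceinline__ void vdr_consumer_sketch(const int * v, const int * u, int & sumi) {
        constexpr int vdr = get_vdr_mmvq(type);
    #pragma unroll
        for (int i = 0; i < vdr; ++i) {
            sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
        }
    }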
 
ggml/src/ggml-cuda/vecdotq.cuh CHANGED
@@ -1,4 +1,5 @@
1
  #include "common.cuh"
 
2
 
3
  static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
4
  const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
@@ -28,6 +29,18 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
28
  return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
29
  }
30
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
33
  // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
@@ -38,7 +51,6 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
38
  template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
39
  const int * v, const int * u, const float & d4, const half2 & ds8) {
40
 
41
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
42
  int sumi = 0;
43
 
44
  #pragma unroll
@@ -47,17 +59,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
47
  const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
48
 
49
  // SIMD dot product of quantized values
50
- sumi = __dp4a(vi0, u[2*i+0], sumi);
51
- sumi = __dp4a(vi1, u[2*i+1], sumi);
52
  }
53
 
54
  const float2 ds8f = __half22float2(ds8);
55
 
56
  // second part effectively subtracts 8 from each quant value
57
  return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
58
- #else
59
- NO_DEVICE_CODE;
60
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
61
  }
62
 
63
  #define VDR_Q4_1_Q8_1_MMVQ 2
@@ -66,7 +75,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
66
  template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
67
  const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
68
 
69
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
70
  int sumi = 0;
71
 
72
  #pragma unroll
@@ -75,8 +83,8 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
75
  const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
76
 
77
  // SIMD dot product of quantized values
78
- sumi = __dp4a(vi0, u[2*i+0], sumi);
79
- sumi = __dp4a(vi1, u[2*i+1], sumi);
80
  }
81
 
82
  #ifdef GGML_CUDA_F16
@@ -92,9 +100,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
92
 
93
  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
94
  return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
95
- #else
96
- NO_DEVICE_CODE;
97
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
98
  }
99
 
100
  #define VDR_Q5_0_Q8_1_MMVQ 2
@@ -103,7 +108,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
103
  template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
104
  const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
105
 
106
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
107
  int sumi = 0;
108
 
109
  #pragma unroll
@@ -113,23 +117,20 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
113
  vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
114
  vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
115
  vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
116
- sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
117
 
118
  int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
119
  vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
120
  vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
121
  vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
122
  vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
123
- sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
124
  }
125
 
126
  const float2 ds8f = __half22float2(ds8);
127
 
128
  // second part effectively subtracts 16 from each quant value
129
  return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
130
- #else
131
- NO_DEVICE_CODE;
132
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
133
  }
134
 
135
  #define VDR_Q5_1_Q8_1_MMVQ 2
@@ -138,7 +139,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
138
  template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
139
  const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
140
 
141
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
142
  int sumi = 0;
143
 
144
  #pragma unroll
@@ -148,14 +148,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
148
  vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
149
  vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
150
  vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
151
- sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
152
 
153
  int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
154
  vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
155
  vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
156
  vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
157
  vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
158
- sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
159
  }
160
 
161
  #ifdef GGML_CUDA_F16
@@ -171,10 +171,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
171
 
172
  // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
173
  return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
174
-
175
- #else
176
- NO_DEVICE_CODE;
177
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
178
  }
179
 
180
  #define VDR_Q8_0_Q8_1_MMVQ 2
@@ -183,31 +179,26 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
183
  template <typename T, int vdr> static __device__ __forceinline__ T vec_dot_q8_0_q8_1_impl(
184
  const int * v, const int * u, const T & d8_0, const T & d8_1) {
185
 
186
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
187
  int sumi = 0;
188
 
189
  #pragma unroll
190
  for (int i = 0; i < vdr; ++i) {
191
  // SIMD dot product of quantized values
192
- sumi = __dp4a(v[i], u[i], sumi);
193
  }
194
 
195
  return d8_0*d8_1 * ((T) sumi);
196
- #else
197
- NO_DEVICE_CODE;
198
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
199
  }
200
 
201
  template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
202
  const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
203
 
204
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
205
  int sumi = 0;
206
 
207
  #pragma unroll
208
  for (int i = 0; i < vdr; ++i) {
209
  // SIMD dot product of quantized values
210
- sumi = __dp4a(v[i], u[i], sumi);
211
  }
212
 
213
  #ifdef GGML_CUDA_F16
@@ -223,9 +214,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
223
 
224
  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
225
  return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
226
- #else
227
- NO_DEVICE_CODE;
228
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
229
  }
230
 
231
  #define VDR_Q2_K_Q8_1_MMVQ 1
@@ -236,7 +224,6 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
236
  const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
237
  const half2 & dm2, const float * __restrict__ d8) {
238
 
239
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
240
  float sumf_d = 0.0f;
241
  float sumf_m = 0.0f;
242
 
@@ -246,28 +233,24 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
246
 
247
  const int vi = (v >> (2*i)) & 0x03030303;
248
 
249
- sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
250
 
251
  // fill int with 4x m
252
  int m = sc >> 4;
253
  m |= m << 8;
254
  m |= m << 16;
255
- sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
256
  }
257
 
258
  const float2 dm2f = __half22float2(dm2);
259
 
260
  return dm2f.x*sumf_d - dm2f.y*sumf_m;
261
- #else
262
- NO_DEVICE_CODE;
263
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
264
  }
265
 
266
  // contiguous u/y values
267
  static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
268
  const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8) {
269
 
270
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
271
  float sumf_d = 0.0f;
272
  float sumf_m = 0.0f;
273
 
@@ -281,8 +264,8 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
281
  #pragma unroll
282
  for (int i = i0; i < i0 + QI8_1/2; ++i) {
283
  const int vi = (vi0 >> (2*(i % (QI8_1/2)))) & 0x03030303;
284
- sumi_d = __dp4a(vi, u[i], sumi_d); // SIMD dot product
285
- sumi_m = __dp4a(0x01010101, u[i], sumi_m);
286
  }
287
 
288
  sumf_d += dm2f.x * sumi_d;
@@ -290,9 +273,6 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
290
  }
291
 
292
  return d8*(sumf_d - sumf_m);
293
- #else
294
- NO_DEVICE_CODE;
295
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
296
  }
297
 
298
  #define VDR_Q3_K_Q8_1_MMVQ 1
@@ -303,7 +283,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
303
  const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
304
  const int & scale_offset, const float & d3, const float * __restrict__ d8) {
305
 
306
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
307
  float sumf = 0.0f;
308
 
309
  #pragma unroll
@@ -326,13 +305,10 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
326
 
327
  const int vi = __vsubss4(vil, vih);
328
 
329
- sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
330
  }
331
 
332
  return d3 * sumf;
333
- #else
334
- NO_DEVICE_CODE;
335
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
336
  }
337
 
338
  // contiguous u/y values
@@ -340,7 +316,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
340
  const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
341
  const float & d3, const float & d8) {
342
 
343
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
344
  int sumi = 0;
345
 
346
  #pragma unroll
@@ -350,16 +325,13 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
350
  #pragma unroll
351
  for (int i = i0; i < i0 + QI8_1/2; ++i) {
352
  const int vi = __vsubss4((v[i/2] >> (4*(i%2))) & 0x0F0F0F0F, 0x04040404);
353
- sumi_sc = __dp4a(vi, u[i], sumi_sc); // SIMD dot product
354
  }
355
 
356
  sumi += sumi_sc * scales[i0 / (QI8_1/2)];
357
  }
358
 
359
  return d3*d8 * sumi;
360
- #else
361
- NO_DEVICE_CODE;
362
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
363
  }
364
 
365
  #define VDR_Q4_K_Q8_1_MMVQ 2
@@ -370,7 +342,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
370
  const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
371
  const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
372
 
373
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
374
  float sumf_d = 0.0f;
375
  float sumf_m = 0.0f;
376
 
@@ -379,8 +350,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
379
  const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
380
  const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
381
 
382
- const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
383
- const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
384
 
385
  sumf_d += d8[i] * (dot1 * sc[i]);
386
  sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
@@ -389,10 +360,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
389
  const float2 dm4f = __half22float2(dm4);
390
 
391
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
392
-
393
- #else
394
- NO_DEVICE_CODE;
395
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
396
  }
397
 
398
  // contiguous u/y values
@@ -400,7 +367,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
400
  const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
401
  const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
402
 
403
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
404
  float sumf_d = 0.0f;
405
  float sumf_m = 0.0f;
406
 
@@ -410,7 +376,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
410
 
411
  #pragma unroll
412
  for (int j = 0; j < QI8_1; ++j) {
413
- sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
414
  }
415
 
416
  const float2 ds8f = __half22float2(ds8[i]);
@@ -422,10 +388,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
422
  const float2 dm4f = __half22float2(dm4);
423
 
424
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
425
-
426
- #else
427
- NO_DEVICE_CODE;
428
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
429
  }
430
 
431
  #define VDR_Q5_K_Q8_1_MMVQ 2
@@ -436,7 +398,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
436
  const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
437
  const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
438
 
439
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
440
  float sumf_d = 0.0f;
441
  float sumf_m = 0.0f;
442
 
@@ -451,8 +412,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
451
  const int v0i = vl0i | vh0i;
452
  const int v1i = vl1i | vh1i;
453
 
454
- const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
455
- const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
456
 
457
  sumf_d += d8[i] * (dot1 * sc[i]);
458
  sumf_m += d8[i] * (dot2 * m[i]);
@@ -462,10 +423,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
462
  const float2 dm5f = __half22float2(dm5);
463
 
464
  return dm5f.x*sumf_d - dm5f.y*sumf_m;
465
-
466
- #else
467
- NO_DEVICE_CODE;
468
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
469
  }
470
 
471
  // contiguous u/y values
@@ -473,7 +430,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
473
  const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
474
  const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
475
 
476
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
477
  float sumf_d = 0.0f;
478
  float sumf_m = 0.0f;
479
 
@@ -483,7 +439,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
483
 
484
  #pragma unroll
485
  for (int j = 0; j < QI8_1; ++j) {
486
- sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
487
  }
488
 
489
  const float2 ds8f = __half22float2(ds8[i]);
@@ -495,10 +451,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
495
  const float2 dm4f = __half22float2(dm4);
496
 
497
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
498
-
499
- #else
500
- NO_DEVICE_CODE;
501
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
502
  }
503
 
504
  #define VDR_Q6_K_Q8_1_MMVQ 1
@@ -509,7 +461,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
509
  const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
510
  const float & d, const float * __restrict__ d8) {
511
 
512
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
513
  float sumf = 0.0f;
514
 
515
  #pragma unroll
@@ -522,13 +473,10 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
522
 
523
  const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
524
 
525
- sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
526
  }
527
 
528
  return d*sumf;
529
- #else
530
- NO_DEVICE_CODE;
531
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
532
  }
533
 
534
  // contiguous u/y values
@@ -536,7 +484,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
536
  const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
537
  const float & d6, const float * __restrict__ d8) {
538
 
539
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
540
  float sumf_d = 0.0f;
541
 
542
  #pragma unroll
@@ -545,21 +492,17 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
545
 
546
  #pragma unroll
547
  for (int i = i0; i < i0 + 2; ++i) {
548
- sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
549
- sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
550
 
551
- sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
552
- sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
553
  }
554
 
555
  sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
556
  }
557
 
558
  return d6 * sumf_d;
559
-
560
- #else
561
- NO_DEVICE_CODE;
562
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
563
  }
564
 
565
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
@@ -572,9 +515,9 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
572
 
573
  #pragma unroll
574
  for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
575
- v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
576
- u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
577
- u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
578
  }
579
 
580
  return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
@@ -591,9 +534,9 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
591
 
592
  #pragma unroll
593
  for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
594
- v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
595
- u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
596
- u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
597
  }
598
 
599
  return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
@@ -610,10 +553,10 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
610
 
611
  #pragma unroll
612
  for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
613
- vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
614
- vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
615
- u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
616
- u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
617
  }
618
 
619
  return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
@@ -630,10 +573,10 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
630
 
631
  #pragma unroll
632
  for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
633
- vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
634
- vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
635
- u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
636
- u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
637
  }
638
 
639
  return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
@@ -649,8 +592,8 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
649
 
650
  #pragma unroll
651
  for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
652
- v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
653
- u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
654
  }
655
 
656
  return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
@@ -666,13 +609,13 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
666
 
667
  const uint8_t * scales = bq2_K->scales + scale_offset;
668
 
669
- const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
670
  int u[QR2_K];
671
  float d8[QR2_K];
672
 
673
  #pragma unroll
674
  for (int i = 0; i < QR2_K; ++ i) {
675
- u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
676
  d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
677
  }
678
 
@@ -689,17 +632,17 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
689
 
690
  const float d = bq3_K->d;
691
 
692
- const int vl = get_int_from_uint8(bq3_K->qs, iqs);
693
 
694
  // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
695
- const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
696
 
697
  int u[QR3_K];
698
  float d8[QR3_K];
699
 
700
  #pragma unroll
701
  for (int i = 0; i < QR3_K; ++i) {
702
- u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
703
  d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
704
  }
705
 
@@ -807,8 +750,8 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
807
  const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
808
  const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
809
 
810
- const int vl = get_int_from_uint8(bq6_K->ql, iqs);
811
- const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
812
 
813
  const int8_t * scales = bq6_K->scales + scale_offset;
814
 
@@ -817,335 +760,342 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
817
 
818
  #pragma unroll
819
  for (int i = 0; i < QR6_K; ++i) {
820
- u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
821
  d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
822
  }
823
 
824
  return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
825
  }
826
 
 
 
827
  static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
828
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
 
829
  const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq + kbx;
830
 
831
- #if QR2_XXS == 8
832
- const int ib32 = iqs;
833
- const uint16_t * q2 = bq2->qs + 4*ib32;
834
- const uint8_t * aux8 = (const uint8_t *)q2;
835
- const int8_t * q8 = bq8_1[ib32].qs;
836
- uint32_t aux32 = q2[2] | (q2[3] << 16);
837
  int sumi = 0;
838
- for (int l = 0; l < 4; ++l) {
839
- const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
840
- const uint8_t signs = ksigns_iq2xs[aux32 & 127];
841
- for (int j = 0; j < 8; ++j) {
842
- sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
843
- }
844
- q8 += 8;
845
- aux32 >>= 7;
 
 
 
 
 
 
846
  }
847
- const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
 
 
 
848
  return d * sumi;
849
- #else
850
- // iqs is 0...15
851
- const int ib32 = iqs/2;
852
- const int il = iqs%2;
853
- const uint16_t * q2 = bq2->qs + 4*ib32;
854
- const uint8_t * aux8 = (const uint8_t *)q2;
855
- const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
856
- const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
857
- const uint32_t aux32 = q2[2] | (q2[3] << 16);
858
- const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
859
- const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
860
- const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
861
- const int8_t * q8 = bq8_1[ib32].qs + 16*il;
862
- int sumi1 = 0, sumi2 = 0;
863
- for (int j = 0; j < 8; ++j) {
864
- sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
865
- sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
866
- }
867
- return d * (sumi1 + sumi2);
868
- #endif
869
  }
870
 
 
 
871
  static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
872
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
873
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
874
  const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq + kbx;
875
 
876
- const int ib32 = iqs;
877
- const uint16_t * q2 = bq2->qs + 4*ib32;
878
- const int8_t * q8 = bq8_1[ib32].qs;
879
- const uint8_t ls1 = bq2->scales[ib32] & 0xf;
880
- const uint8_t ls2 = bq2->scales[ib32] >> 4;
 
881
  int sumi1 = 0;
882
- for (int l = 0; l < 2; ++l) {
883
- const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
884
- const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
885
- const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
886
- const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
887
- sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
888
- sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
889
- q8 += 8;
890
- }
891
- int sumi2 = 0;
892
- for (int l = 2; l < 4; ++l) {
893
- const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
894
- const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
895
- const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
896
- const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
897
- sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
898
- sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
899
- q8 += 8;
900
  }
901
- const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
902
- return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
903
- #else
904
- GGML_UNUSED(ksigns64);
905
- NO_DEVICE_CODE;
906
- #endif
907
  }
908
 
909
- // TODO
 
910
  static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
911
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
912
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
913
  const block_iq2_s * bq2 = (const block_iq2_s *) vbq + kbx;
914
 
915
- const int ib32 = iqs;
916
- const int8_t * q8 = bq8_1[ib32].qs;
917
- const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
918
- const uint8_t ls1 = bq2->scales[ib32] & 0xf;
919
- const uint8_t ls2 = bq2->scales[ib32] >> 4;
 
 
 
 
 
 
 
920
  int sumi1 = 0;
921
- for (int l = 0; l < 2; ++l) {
922
- const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
923
- const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
924
- const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
925
- const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
926
- const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
927
- sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
928
- sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
929
- q8 += 8;
930
- }
931
- int sumi2 = 0;
932
- for (int l = 2; l < 4; ++l) {
933
- const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
934
- const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
935
- const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
936
- const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
937
- const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
938
- sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
939
- sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
940
- q8 += 8;
941
  }
942
- const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
943
- return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
944
- #else
945
- GGML_UNUSED(ksigns64);
946
- NO_DEVICE_CODE;
947
- #endif
948
  }
949
 
 
 
950
  static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
951
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
952
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
953
- const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq + kbx;
954
-
955
- const int ib32 = iqs;
956
- const uint8_t * q3 = bq2->qs + 8*ib32;
957
- const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
958
- const int8_t * q8 = bq8_1[ib32].qs;
959
- uint32_t aux32 = gas[0] | (gas[1] << 16);
960
  int sumi = 0;
961
- for (int l = 0; l < 4; ++l) {
962
- const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
963
- const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
964
- const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
965
- const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
966
- const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
967
- sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
968
- sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
969
- q8 += 8;
970
- aux32 >>= 7;
 
 
 
 
971
  }
972
- const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
 
 
 
973
  return d * sumi;
974
- #else
975
- NO_DEVICE_CODE;
976
- #endif
977
  }
978
 
 
 
979
  // TODO: don't use lookup table for signs
980
  static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
981
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
982
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
983
- const block_iq3_s * bq2 = (const block_iq3_s *) vbq + kbx;
984
 
985
- const int ib32 = iqs;
986
- const uint8_t * qs = bq2->qs + 8*ib32;
987
- const int8_t * q8 = bq8_1[ib32].qs;
 
 
 
 
 
 
 
988
  int sumi = 0;
989
- for (int l = 0; l < 4; ++l) {
990
- const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
991
- const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
992
- uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
993
- uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
994
- const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
995
- const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
996
- sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
997
- sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
998
- q8 += 8;
 
 
 
 
 
 
 
999
  }
1000
- const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
 
 
 
1001
  return d * sumi;
1002
- #else
1003
- NO_DEVICE_CODE;
1004
- #endif
1005
  }
1006
 
1007
  static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
1008
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
1009
  const block_iq1_s * bq1 = (const block_iq1_s *) vbq + kbx;
1010
 
1011
- const int ib32 = iqs;
 
 
 
 
1012
  int sumi = 0;
1013
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1014
- const int * q8 = (const int *)bq8_1[ib32].qs;
1015
- for (int l = 0; l < 4; ++l) {
1016
- const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
1017
- int grid0 = grid[0] & 0x0f0f0f0f;
1018
- int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
1019
- sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
1020
- }
1021
- #else
1022
- const int8_t * q8 = bq8_1[ib32].qs;
1023
- for (int l = 0; l < 4; ++l) {
1024
- const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
1025
- for (int j = 0; j < 4; ++j) {
1026
- sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
1027
- }
1028
- q8 += 8;
1029
  }
1030
- #endif
1031
- const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
1032
- const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
1033
- const float d = d1q * __low2float (bq8_1[ib32].ds);
1034
- const float m = d1q * __high2float(bq8_1[ib32].ds);
1035
- return d * sumi + m * delta;
1036
  }
1037
 
1038
  static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
1039
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
 
1040
  const block_iq1_m * bq1 = (const block_iq1_m *) vbq + kbx;
1041
 
1042
- const int ib32 = iqs;
1043
- int sumi[2] = {0, 0};
1044
- float sumf[2] = {0.f, 0.f};
1045
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1046
- const int * q8 = (const int *)bq8_1[ib32].qs;
1047
- for (int l = 0; l < 4; ++l) {
1048
- const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
1049
- int grid0 = grid[0] & 0x0f0f0f0f;
1050
- int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
1051
- sumi[l/2] = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi[l/2]));
1052
- const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
1053
- const int sumy = __dp4a(q8[2*l+1], 0x01010101, __dp4a(q8[2*l+0], 0x01010101, 0));
1054
- sumf[l/2] += delta*sumy;
1055
- }
1056
- #else
1057
- const int8_t * q8 = bq8_1[ib32].qs;
1058
- for (int l = 0; l < 4; ++l) {
1059
- const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
 
 
 
1060
  int sumy = 0;
1061
- for (int j = 0; j < 4; ++j) {
1062
- sumi[l/2] += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
1063
- sumy += q8[j] + q8[j+4];
1064
- }
1065
- const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
1066
- sumf[l/2] += delta*sumy;
1067
- q8 += 8;
1068
  }
1069
- #endif
 
 
1070
  iq1m_scale_t scale;
1071
- const uint16_t * sc = (const uint16_t *)bq1->scales;
1072
- scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
1073
- const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
1074
- return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
 
 
 
1075
  }
1076
 
1077
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1078
- static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
1079
- int & val1, int & val2) {
1080
-
1081
- uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
1082
- aux32 = q4 & 0x0f0f0f0f;
1083
- uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
1084
- uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
1085
- val1 = v1 | (v2 << 16);
1086
- aux32 = (q4 >> 4) & 0x0f0f0f0f;
1087
- v1 = values[q8[0]] | (values[q8[1]] << 8);
1088
- v2 = values[q8[2]] | (values[q8[3]] << 8);
1089
- val2 = v1 | (v2 << 16);
1090
  }
1091
- #endif
 
1092
 
1093
  static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
1094
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
1095
 
1096
- const block_iq4_nl * bq = (const block_iq4_nl *) vbq + kbx;
1097
 
1098
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1099
- const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
1100
- const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;
1101
 
1102
- const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
1103
-
1104
- int v1, v2;
1105
- int sumi1 = 0, sumi2 = 0;
1106
  for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
1107
- const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
1108
- get_int_from_table_16(aux, values, v1, v2);
1109
- sumi1 = __dp4a(v1, q8[l+0], sumi1);
1110
- sumi2 = __dp4a(v2, q8[l+4], sumi2);
1111
- }
1112
-
1113
- #else
1114
- const uint8_t * q4 = bq->qs + 4*iqs;
1115
- const int8_t * q8 = bq8_1->qs + 4*iqs;
1116
 
1117
- int sumi1 = 0, sumi2 = 0;
1118
- for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
1119
- sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
1120
- sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4];
1121
  }
1122
- #endif
1123
- const float d = (float)bq->d * __low2float(bq8_1->ds);
1124
- return d * (sumi1 + sumi2);
1125
  }
1126
 
 
 
1127
  static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
1128
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
1129
 
1130
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1131
  const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq + kbx;
1132
- const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
1133
-
1134
- // iqs is 0...7
1135
- const int ib32 = iqs;
1136
- const int32_t * q8 = (const int *)bq8_1[ib32].qs;
1137
- const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
1138
- const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
1139
- const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
1140
- int v1, v2;
1141
- int sumi1 = 0, sumi2 = 0;
1142
  for (int j = 0; j < 4; ++j) {
1143
- get_int_from_table_16(q4[j], values, v1, v2);
1144
- sumi1 = __dp4a(v1, q8[j+0], sumi1);
1145
- sumi2 = __dp4a(v2, q8[j+4], sumi2);
 
 
 
 
 
1146
  }
1147
- return d * (sumi1 + sumi2);
1148
- #else
1149
- return vec_dot_iq4_xs_q8_1(vbq, bq8_1, kbx, iqs);
1150
- #endif
 
 
1151
  }
 
1
  #include "common.cuh"
2
+ #include <cstdint>
3
 
4
  static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
5
  const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
29
  return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
30
  }
31
 
32
+ static __device__ __forceinline__ int get_int_b2(const void * x, const int & i32) {
33
+ const uint16_t * x16 = (const uint16_t *) x;
34
+
35
+ int x32 = x16[2*i32 + 0] << 0;
36
+ x32 |= x16[2*i32 + 1] << 16;
37
+
38
+ return x32;
39
+ }
40
+
41
+ static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32) {
42
+ return ((const int *) x)[i32]; // assume at least 4 byte alignment
43
+ }
44
 
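A note on the two loaders added above: they differ only in the alignment they may assume. Blocks whose quant data is preceded by a 2-byte half (e.g. block_q4_0) only guarantee 2-byte alignment, so get_int_b2 assembles the 32-bit word from two 16-bit reads, while blocks that start with a 4-byte half2 (e.g. block_q8_1) allow the direct load in get_int_b4. A minimal host-side sketch of the same idea; the helper names below are illustrative and not part of the patch:

#include <cstdint>

// Assemble a 32-bit word from data that is only guaranteed 2-byte alignment.
static int32_t load_i32_align2(const void * x, int i32) {
    const uint16_t * x16 = (const uint16_t *) x;
    return (int32_t) x16[2*i32 + 0] | ((int32_t) x16[2*i32 + 1] << 16);
}

// Plain load when the block layout guarantees 4-byte alignment.
static int32_t load_i32_align4(const void * x, int i32) {
    return ((const int32_t *) x)[i32];
}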
45
  // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
46
  // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
 
51
  template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
52
  const int * v, const int * u, const float & d4, const half2 & ds8) {
53
 
 
54
  int sumi = 0;
55
 
56
  #pragma unroll
 
59
  const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
60
 
61
  // SIMD dot product of quantized values
62
+ sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
63
+ sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
64
  }
65
 
66
  const float2 ds8f = __half22float2(ds8);
67
 
68
  // second part effectively subtracts 8 from each quant value
69
  return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 
 
 
70
  }
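Where the removed lines earlier in this diff call __dp4a directly, the new code goes through ggml_cuda_dp4a (presumably a wrapper in common.cuh with fallbacks for targets that lack the intrinsic). The per-byte dot-product-accumulate semantics are unchanged; a scalar reference of what each call computes, as an illustrative sketch rather than code from this patch:

#include <cstdint>

// dp4a semantics: treat a and b as four packed signed 8-bit lanes,
// multiply lane-wise, and accumulate the four products into c.
static int dp4a_ref(int a, int b, int c) {
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    for (int i = 0; i < 4; ++i) {
        c += (int) a8[i] * (int) b8[i];
    }
    return c;
}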
71
 
72
  #define VDR_Q4_1_Q8_1_MMVQ 2
 
75
  template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
76
  const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
77
 
 
78
  int sumi = 0;
79
 
80
  #pragma unroll
 
83
  const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
84
 
85
  // SIMD dot product of quantized values
86
+ sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
87
+ sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
88
  }
89
 
90
  #ifdef GGML_CUDA_F16
 
100
 
101
  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
102
  return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 
 
 
103
  }
104
 
105
  #define VDR_Q5_0_Q8_1_MMVQ 2
 
108
  template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
109
  const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
110
 
 
111
  int sumi = 0;
112
 
113
  #pragma unroll
 
117
  vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
118
  vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
119
  vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
120
+ sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
121
 
122
  int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
123
  vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
124
  vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
125
  vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
126
  vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
127
+ sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
128
  }
129
 
130
  const float2 ds8f = __half22float2(ds8);
131
 
132
  // second part effectively subtracts 16 from each quant value
133
  return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 
 
 
134
  }
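The shift-and-mask lines in the q5 loops above rebuild 5-bit quants: the low nibbles of vl land in bits 0-3 of each byte of vi0 (the upper nibbles in vi1), and the matching high bit from vh is OR-ed into bit 4 of that byte, i.e. bit positions 4, 12, 20, 28, which is what comments like "1 -> 12" and "17 -> 12" record. A scalar sketch of the vi0 half (illustrative only; vi1 does the same with the next four vh bits):

// Rebuild four packed 5-bit values: low nibbles from vl, and bit k of vh
// placed at bit 8*k + 4 of the packed word.
static int pack_q5_group(int vl, int vh) {
    int vi = vl & 0x0F0F0F0F;
    for (int k = 0; k < 4; ++k) {
        vi |= ((vh >> k) & 1) << (8*k + 4);
    }
    return vi;
}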
135
 
136
  #define VDR_Q5_1_Q8_1_MMVQ 2
 
139
  template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
140
  const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
141
 
 
142
  int sumi = 0;
143
 
144
  #pragma unroll
 
148
  vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
149
  vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
150
  vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
151
+ sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
152
 
153
  int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
154
  vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
155
  vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
156
  vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
157
  vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
158
+ sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
159
  }
160
 
161
  #ifdef GGML_CUDA_F16
 
171
 
172
  // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
173
  return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 
 
 
174
  }
175
 
176
  #define VDR_Q8_0_Q8_1_MMVQ 2
 
179
  template <typename T, int vdr> static __device__ __forceinline__ T vec_dot_q8_0_q8_1_impl(
180
  const int * v, const int * u, const T & d8_0, const T & d8_1) {
181
 
 
182
  int sumi = 0;
183
 
184
  #pragma unroll
185
  for (int i = 0; i < vdr; ++i) {
186
  // SIMD dot product of quantized values
187
+ sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
188
  }
189
 
190
  return d8_0*d8_1 * ((T) sumi);
 
 
 
191
  }
192
 
193
  template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
194
  const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
195
 
 
196
  int sumi = 0;
197
 
198
  #pragma unroll
199
  for (int i = 0; i < vdr; ++i) {
200
  // SIMD dot product of quantized values
201
+ sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
202
  }
203
 
204
  #ifdef GGML_CUDA_F16
 
214
 
215
  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
216
  return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 
 
 
217
  }
218
 
219
  #define VDR_Q2_K_Q8_1_MMVQ 1
 
224
  const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
225
  const half2 & dm2, const float * __restrict__ d8) {
226
 
 
227
  float sumf_d = 0.0f;
228
  float sumf_m = 0.0f;
229
 
 
233
 
234
  const int vi = (v >> (2*i)) & 0x03030303;
235
 
236
+ sumf_d += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
237
 
238
  // fill int with 4x m
239
  int m = sc >> 4;
240
  m |= m << 8;
241
  m |= m << 16;
242
+ sumf_m += d8[i] * ggml_cuda_dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
243
  }
244
 
245
  const float2 dm2f = __half22float2(dm2);
246
 
247
  return dm2f.x*sumf_d - dm2f.y*sumf_m;
 
 
 
248
  }
249
 
250
  // contiguous u/y values
251
  static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
252
  const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8) {
253
 
 
254
  float sumf_d = 0.0f;
255
  float sumf_m = 0.0f;
256
 
 
264
  #pragma unroll
265
  for (int i = i0; i < i0 + QI8_1/2; ++i) {
266
  const int vi = (vi0 >> (2*(i % (QI8_1/2)))) & 0x03030303;
267
+ sumi_d = ggml_cuda_dp4a(vi, u[i], sumi_d); // SIMD dot product
268
+ sumi_m = ggml_cuda_dp4a(0x01010101, u[i], sumi_m);
269
  }
270
 
271
  sumf_d += dm2f.x * sumi_d;
 
273
  }
274
 
275
  return d8*(sumf_d - sumf_m);
 
 
 
276
  }
277
 
278
  #define VDR_Q3_K_Q8_1_MMVQ 1
 
283
  const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
284
  const int & scale_offset, const float & d3, const float * __restrict__ d8) {
285
 
 
286
  float sumf = 0.0f;
287
 
288
  #pragma unroll
 
305
 
306
  const int vi = __vsubss4(vil, vih);
307
 
308
+ sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product
309
  }
310
 
311
  return d3 * sumf;
 
 
 
312
  }
313
 
314
  // contiguous u/y values
 
316
  const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
317
  const float & d3, const float & d8) {
318
 
 
319
  int sumi = 0;
320
 
321
  #pragma unroll
 
325
  #pragma unroll
326
  for (int i = i0; i < i0 + QI8_1/2; ++i) {
327
  const int vi = __vsubss4((v[i/2] >> (4*(i%2))) & 0x0F0F0F0F, 0x04040404);
328
+ sumi_sc = ggml_cuda_dp4a(vi, u[i], sumi_sc); // SIMD dot product
329
  }
330
 
331
  sumi += sumi_sc * scales[i0 / (QI8_1/2)];
332
  }
333
 
334
  return d3*d8 * sumi;
 
 
 
335
  }
336
 
337
  #define VDR_Q4_K_Q8_1_MMVQ 2
 
342
  const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
343
  const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
344
 
 
345
  float sumf_d = 0.0f;
346
  float sumf_m = 0.0f;
347
 
 
350
  const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
351
  const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
352
 
353
+ const int dot1 = ggml_cuda_dp4a(v1i, u[2*i+1], ggml_cuda_dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
354
+ const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+1], ggml_cuda_dp4a(0x01010101, u[2*i+0], 0)); // sum of u
355
 
356
  sumf_d += d8[i] * (dot1 * sc[i]);
357
  sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
 
360
  const float2 dm4f = __half22float2(dm4);
361
 
362
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 
 
 
363
  }
364
 
365
  // contiguous u/y values
 
367
  const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
368
  const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
369
 
 
370
  float sumf_d = 0.0f;
371
  float sumf_m = 0.0f;
372
 
 
376
 
377
  #pragma unroll
378
  for (int j = 0; j < QI8_1; ++j) {
379
+ sumi_d = ggml_cuda_dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
380
  }
381
 
382
  const float2 ds8f = __half22float2(ds8[i]);
 
388
  const float2 dm4f = __half22float2(dm4);
389
 
390
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 
 
 
391
  }
392
 
393
  #define VDR_Q5_K_Q8_1_MMVQ 2
 
398
  const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
399
  const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
400
 
 
401
  float sumf_d = 0.0f;
402
  float sumf_m = 0.0f;
403
 
 
412
  const int v0i = vl0i | vh0i;
413
  const int v1i = vl1i | vh1i;
414
 
415
+ const int dot1 = ggml_cuda_dp4a(v0i, u[2*i+0], ggml_cuda_dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
416
+ const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+0], ggml_cuda_dp4a(0x01010101, u[2*i+1], 0)); // sum of u
417
 
418
  sumf_d += d8[i] * (dot1 * sc[i]);
419
  sumf_m += d8[i] * (dot2 * m[i]);
 
423
  const float2 dm5f = __half22float2(dm5);
424
 
425
  return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 
 
 
426
  }
427
 
428
  // contiguous u/y values
 
430
  const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
431
  const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
432
 
 
433
  float sumf_d = 0.0f;
434
  float sumf_m = 0.0f;
435
 
 
439
 
440
  #pragma unroll
441
  for (int j = 0; j < QI8_1; ++j) {
442
+ sumi_d = ggml_cuda_dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
443
  }
444
 
445
  const float2 ds8f = __half22float2(ds8[i]);
 
451
  const float2 dm4f = __half22float2(dm4);
452
 
453
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 
 
 
454
  }
455
 
456
  #define VDR_Q6_K_Q8_1_MMVQ 1
 
461
  const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
462
  const float & d, const float * __restrict__ d8) {
463
 
 
464
  float sumf = 0.0f;
465
 
466
  #pragma unroll
 
473
 
474
  const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
475
 
476
+ sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product
477
  }
478
 
479
  return d*sumf;
 
 
 
480
  }
481
 
482
  // contiguous u/y values
 
484
  const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
485
  const float & d6, const float * __restrict__ d8) {
486
 
 
487
  float sumf_d = 0.0f;
488
 
489
  #pragma unroll
 
492
 
493
  #pragma unroll
494
  for (int i = i0; i < i0 + 2; ++i) {
495
+ sumi_d.x = ggml_cuda_dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
496
+ sumi_d.x = ggml_cuda_dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
497
 
498
+ sumi_d.y = ggml_cuda_dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
499
+ sumi_d.y = ggml_cuda_dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
500
  }
501
 
502
  sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
503
  }
504
 
505
  return d6 * sumf_d;
 
 
 
 
506
  }
507
 
508
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 
515
 
516
  #pragma unroll
517
  for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
518
+ v[i] = get_int_b2(bq4_0->qs, iqs + i);
519
+ u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
520
+ u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI4_0);
521
  }
522
 
523
  return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
 
534
 
535
  #pragma unroll
536
  for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
537
+ v[i] = get_int_b4(bq4_1->qs, iqs + i);
538
+ u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
539
+ u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI4_1);
540
  }
541
 
542
  return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
 
553
 
554
  #pragma unroll
555
  for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
556
+ vl[i] = get_int_b2(bq5_0->qs, iqs + i);
557
+ vh[i] = get_int_b2(bq5_0->qh, 0) >> (4 * (iqs + i));
558
+ u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
559
+ u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI5_0);
560
  }
561
 
562
  return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
 
573
 
574
  #pragma unroll
575
  for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
576
+ vl[i] = get_int_b4(bq5_1->qs, iqs + i);
577
+ vh[i] = get_int_b4(bq5_1->qh, 0) >> (4 * (iqs + i));
578
+ u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
579
+ u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI5_1);
580
  }
581
 
582
  return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
 
592
 
593
  #pragma unroll
594
  for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
595
+ v[i] = get_int_b2(bq8_0->qs, iqs + i);
596
+ u[i] = get_int_b4(bq8_1->qs, iqs + i);
597
  }
598
 
599
  return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
 
609
 
610
  const uint8_t * scales = bq2_K->scales + scale_offset;
611
 
612
+ const int v = get_int_b4(bq2_K->qs, iqs);
613
  int u[QR2_K];
614
  float d8[QR2_K];
615
 
616
  #pragma unroll
617
  for (int i = 0; i < QR2_K; ++ i) {
618
+ u[i] = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
619
  d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
620
  }
621
 
 
632
 
633
  const float d = bq3_K->d;
634
 
635
+ const int vl = get_int_b2(bq3_K->qs, iqs);
636
 
637
  // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
638
+ const int vh = ~get_int_b2(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
639
 
640
  int u[QR3_K];
641
  float d8[QR3_K];
642
 
643
  #pragma unroll
644
  for (int i = 0; i < QR3_K; ++i) {
645
+ u[i] = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
646
  d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
647
  }
648
 
 
750
  const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
751
  const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
752
 
753
+ const int vl = get_int_b2(bq6_K->ql, iqs);
754
+ const int vh = get_int_b2(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
755
 
756
  const int8_t * scales = bq6_K->scales + scale_offset;
757
 
 
760
 
761
  #pragma unroll
762
  for (int i = 0; i < QR6_K; ++i) {
763
+ u[i] = get_int_b4(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
764
  d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
765
  }
766
 
767
  return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
768
  }
769
 
770
+ #define VDR_IQ2_XXS_Q8_1_MMVQ 2
771
+
772
  static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
773
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
774
+
775
  const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq + kbx;
776
 
777
+ const int q2 = get_int_b2(bq2->qs, iqs);
778
+ const uint8_t * aux8 = (const uint8_t *) &q2;
779
+ const uint32_t aux32 = get_int_b2(bq2->qs, iqs + 1);
780
+
 
 
781
  int sumi = 0;
782
+ #pragma unroll
783
+ for (int k0 = 0; k0 < 8; k0 += 2) {
784
+ const int * grid_pos = (const int *) (iq2xxs_grid + aux8[k0/2]);
785
+ const int signs_packed = ksigns_iq2xs[(aux32 >> (7*k0/2)) & 0x7F];
786
+
787
+ const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
788
+ const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
789
+ const int u0 = get_int_b4(bq8_1[iqs/2].qs, k0 + 0);
790
+ sumi = ggml_cuda_dp4a(grid0, u0, sumi);
791
+
792
+ const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
793
+ const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
794
+ const int u1 = get_int_b4(bq8_1[iqs/2].qs, k0 + 1);
795
+ sumi = ggml_cuda_dp4a(grid1, u1, sumi);
796
  }
797
+
798
+ const int ls = aux32 >> 28;
799
+ sumi = (ls*sumi + sumi/2)/4;
800
+ const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
801
  return d * sumi;
802
  }
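One detail worth calling out in the new iq2_xxs path: the 4-bit sub-block scale ls taken from the top of aux32 enters the result as (ls + 0.5)/4, and the line sumi = (ls*sumi + sumi/2)/4 performs that scaling in integer arithmetic (up to truncation) so that only one float multiply remains. A float sketch of the intended factor, illustrative rather than patch code:

// Intended scaling of the raw iq2_xxs dot product: block scale d, 4-bit
// sub-block scale ls, and the q8_1 scale d8, combined as d*d8*(ls + 0.5)/4.
static float iq2_xxs_scale_ref(float d, float d8, int ls, int sumi_raw) {
    return d * d8 * (0.5f + (float) ls) * 0.25f * (float) sumi_raw;
}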
803
 
804
+ #define VDR_IQ2_XS_Q8_1_MMVQ 2
805
+
806
  static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
807
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
808
+
809
  const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq + kbx;
810
 
811
+ const int2 q2_packed = make_int2(get_int_b2(bq2->qs, iqs + 0), get_int_b2(bq2->qs, iqs + 1));
812
+ const uint16_t * q2 = (const uint16_t *) &q2_packed;
813
+ const int ls0 = bq2->scales[iqs/2] & 0x0F;
814
+ const int ls1 = bq2->scales[iqs/2] >> 4;
815
+
816
+ int sumi0 = 0;
817
  int sumi1 = 0;
818
+ #pragma unroll
819
+ for (int l0 = 0; l0 < 8; l0 += 2) {
820
+ const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l0/2] & 0x000001FF));
821
+ const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l0/2] >> 9));
822
+
823
+ const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
824
+ const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
825
+
826
+ const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
827
+ const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
828
+
829
+ if (l0 < 4) {
830
+ sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0);
831
+ sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0);
832
+ } else {
833
+ sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1);
834
+ sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1);
835
+ }
836
  }
837
+ const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
838
+ const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
839
+ return d * sumi;
 
 
 
840
  }
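The sign handling shared by the iq2/iq3 kernels rests on one identity: each byte of the signs word (whether taken from ksigns64 or built with __vcmpne4) is either 0x00 (keep) or 0xFF (negate), and __vsub4(grid ^ signs, signs) then negates exactly the flagged bytes, since (x ^ -1) - (-1) equals -x in two's complement. A one-byte scalar sketch of that identity, for illustration only:

#include <cstdint>

// Conditional negation by mask: mask is 0x00 (keep x) or 0xFF (yield -x).
static int8_t apply_sign_ref(int8_t x, uint8_t mask) {
    const int8_t m = (int8_t) mask;  // 0 or -1
    return (int8_t) ((x ^ m) - m);   // x unchanged, or ~x + 1 == -x
}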
841
 
842
+ #define VDR_IQ2_S_Q8_1_MMVQ 2
843
+
844
  static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
845
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
846
+
847
  const block_iq2_s * bq2 = (const block_iq2_s *) vbq + kbx;
848
 
849
+ const int qs_packed = get_int_b2(bq2->qs, iqs/2);
850
+ const uint8_t * qs = (const uint8_t *) &qs_packed;
851
+
852
+ const int qh = bq2->qh[iqs/2];
853
+
854
+ const int signs_packed_32 = get_int_b2(bq2->qs, QK_K/32 + iqs/2);
855
+ const uint8_t * signs_packed_8 = (const uint8_t *) &signs_packed_32;
856
+
857
+ const int ls0 = bq2->scales[iqs/2] & 0x0F;
858
+ const int ls1 = bq2->scales[iqs/2] >> 4;
859
+
860
+ int sumi0 = 0;
861
  int sumi1 = 0;
862
+ #pragma unroll
863
+ for (int l0 = 0; l0 < 8; l0 += 2) {
864
+ const int * grid_pos = (const int *)(iq2s_grid + (qs[l0/2] | ((qh << (8-l0)) & 0x300)));
865
+
866
+ const int signs0 = __vcmpne4(((signs_packed_8[l0/2] & 0x03) << 7) | ((signs_packed_8[l0/2] & 0x0C) << 21), 0x00000000);
867
+ const int signs1 = __vcmpne4(((signs_packed_8[l0/2] & 0x30) << 3) | ((signs_packed_8[l0/2] & 0xC0) << 17), 0x00000000);
868
+
869
+ const int grid_l = __vsub4(grid_pos[0] ^ signs0, signs0);
870
+ const int grid_h = __vsub4(grid_pos[1] ^ signs1, signs1);
871
+
872
+ const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
873
+ const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
874
+
875
+ if (l0 < 4) {
876
+ sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0);
877
+ sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0);
878
+ } else {
879
+ sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1);
880
+ sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1);
881
+ }
882
  }
883
+ const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
884
+
885
+ const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
886
+ return d * sumi;
 
 
887
  }
888
 
889
+ #define VDR_IQ3_XXS_Q8_1_MMVQ 2
890
+
891
  static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
892
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
893
+
894
+ const block_iq3_xxs * bq3 = (const block_iq3_xxs *) vbq + kbx;
895
+
896
+ const int2 q3_packed = make_int2(get_int_b2(bq3->qs, iqs), get_int_b2(bq3->qs, iqs+1));
897
+ const uint8_t * q3 = (const uint8_t *) &q3_packed;
898
+ const uint32_t aux32 = get_int_b2(bq3->qs, QK_K/16 + iqs/2);
899
+
 
900
  int sumi = 0;
901
+ #pragma unroll
902
+ for (int l0 = 0; l0 < 8; l0 += 2) {
903
+ const int2 grid_pos = make_int2(iq3xxs_grid[q3[l0 + 0]], iq3xxs_grid[q3[l0 + 1]]);
904
+
905
+ const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l0/2)) & 0x7F));
906
+
907
+ const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
908
+ const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
909
+
910
+ const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
911
+ const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
912
+
913
+ sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
914
+ sumi = ggml_cuda_dp4a(grid_h, u1, sumi);
915
  }
916
+
917
+ const int ls = aux32 >> 28;
918
+ sumi = (ls*sumi + sumi/2)/2;
919
+ const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
920
  return d * sumi;
 
 
 
921
  }
922
 
923
+ #define VDR_IQ3_S_Q8_1_MMVQ 2
924
+
925
  // TODO: don't use lookup table for signs
926
  static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
927
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
 
 
928
 
929
+ const block_iq3_s * bq3 = (const block_iq3_s *) vbq + kbx;
930
+
931
+ const int2 qs_packed = make_int2(get_int_b2(bq3->qs, iqs + 0), get_int_b2(bq3->qs, iqs + 1));
932
+ const uint8_t * qs = (const uint8_t *) &qs_packed;
933
+
934
+ const int qh = bq3->qh[iqs/2];
935
+
936
+ const int signs_packed_32 = get_int_b2(bq3->signs, iqs/2);
937
+ const uint8_t * signs_packed_8 = (const uint8_t *) &signs_packed_32;
938
+
939
  int sumi = 0;
940
+ #pragma unroll
941
+ for (int l0 = 0; l0 < 8; l0 += 2) {
942
+ const int2 grid_pos = make_int2(
943
+ iq3s_grid[qs[l0 + 0] | ((qh << (8 - l0)) & 0x100)],
944
+ iq3s_grid[qs[l0 + 1] | ((qh << (7 - l0)) & 0x100)]);
945
+
946
+ const int signs0 = __vcmpne4(((signs_packed_8[l0/2] & 0x03) << 7) | ((signs_packed_8[l0/2] & 0x0C) << 21), 0x00000000);
947
+ const int signs1 = __vcmpne4(((signs_packed_8[l0/2] & 0x30) << 3) | ((signs_packed_8[l0/2] & 0xC0) << 17), 0x00000000);
948
+
949
+ const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
950
+ const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
951
+
952
+ const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
953
+ const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
954
+
955
+ sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
956
+ sumi = ggml_cuda_dp4a(grid_h, u1, sumi);
957
  }
958
+
959
+ sumi *= 1 + 2*((bq3->scales[iqs/4] >> ((iqs << 1) & 0x04)) & 0x0F);
960
+
961
+ const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
962
  return d * sumi;
 
 
 
963
  }
964
 
965
  static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
966
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
967
  const block_iq1_s * bq1 = (const block_iq1_s *) vbq + kbx;
968
 
969
+ const int qs_packed = get_int_b2(bq1->qs, iqs);
970
+ const uint8_t * qs = (const uint8_t *) &qs_packed;
971
+
972
+ const int qh = bq1->qh[iqs];
973
+
974
  int sumi = 0;
975
+ #pragma unroll
976
+ for (int l0 = 0; l0 < 8; l0 += 2) {
977
+ const int grid = iq1s_grid_gpu[qs[l0/2] | (((qh >> 3*(l0/2)) & 0x07) << 8)];
978
+
979
+ const int grid0 = (grid >> 0) & 0x0F0F0F0F;
980
+ const int grid1 = (grid >> 4) & 0x0F0F0F0F;
981
+
982
+ const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
983
+ const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
984
+
985
+ sumi = ggml_cuda_dp4a(grid0, u0, sumi);
986
+ sumi = ggml_cuda_dp4a(grid1, u1, sumi);
 
 
 
 
987
  }
988
+
989
+ const float d1q = __half2float(bq1->d) * (((qh >> 11) & 0x0E) + 1);
990
+ const float delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);
991
+ const float2 ds = __half22float2(bq8_1[iqs].ds);
992
+ return d1q * (ds.x*sumi + ds.y*delta);
 
993
  }
994
 
995
  static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
996
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
997
+
998
  const block_iq1_m * bq1 = (const block_iq1_m *) vbq + kbx;
999
 
1000
+ const int qs_packed = get_int_b4(bq1->qs, iqs);
1001
+ const uint8_t * qs = (const uint8_t *) &qs_packed;
1002
+
1003
+ int sumi[2] = {0};
1004
+ float sumf[2] = {0.0f};
1005
+ #pragma unroll
1006
+ for (int l0 = 0; l0 < 8; l0 += 2) {
1007
+ const int qhl = bq1->qh[2*iqs + l0/4] >> (4 * ((l0/2) % 2));
1008
+
1009
+ const int grid = iq1s_grid_gpu[qs[l0/2] | ((qhl & 0x07) << 8)];
1010
+
1011
+ const int grid0 = (grid >> 0) & 0x0F0F0F0F;
1012
+ const int grid1 = (grid >> 4) & 0x0F0F0F0F;
1013
+
1014
+ const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
1015
+ const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
1016
+
1017
+ sumi[l0/4] = ggml_cuda_dp4a(grid0, u0, sumi[l0/4]);
1018
+ sumi[l0/4] = ggml_cuda_dp4a(grid1, u1, sumi[l0/4]);
1019
+
1020
+ const float delta = -1.0f + IQ1M_DELTA - (qhl & 0x08) * (2.0f*IQ1M_DELTA/0x08);
1021
  int sumy = 0;
1022
+ sumy = ggml_cuda_dp4a(u0, 0x01010101, sumy);
1023
+ sumy = ggml_cuda_dp4a(u1, 0x01010101, sumy);
1024
+ sumf[l0/4] += delta*sumy;
 
 
 
 
1025
  }
1026
+
1027
+ const uint16_t * sc = (const uint16_t *) bq1->scales;
1028
+
1029
  iq1m_scale_t scale;
1030
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00F0) | ((sc[2] >> 4) & 0x0F00) | (sc[3] & 0xF000);
1031
+ const float d = __half2float(scale.f16) * __low2float(bq8_1[iqs].ds);
1032
+
1033
+ const int tmp = sc[iqs/2] >> (6*(iqs%2));
1034
+ const int sc0 = 2*((tmp >> 0) & 0x07) + 1;
1035
+ const int sc1 = 2*((tmp >> 3) & 0x07) + 1;
1036
+ return d * ((sumi[0] + sumf[0]) * sc0 + (sumi[1] + sumf[1]) * sc1);
1037
  }
1038
 
1039
+ static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4) {
1040
+ const int q0_32 = (q4 >> 0) & 0x0F0F0F0F;
1041
+ const int8_t * q0_8 = (const int8_t *) &q0_32;
1042
+ const char4 val0_8 = make_char4(
1043
+ kvalues_iq4nl[q0_8[0]], kvalues_iq4nl[q0_8[1]], kvalues_iq4nl[q0_8[2]], kvalues_iq4nl[q0_8[3]]);
1044
+
1045
+ const int q1_32 = (q4 >> 4) & 0x0F0F0F0F;
1046
+ const int8_t * q1_8 = (const int8_t *) &q1_32;
1047
+ const char4 val1_8 = make_char4(
1048
+ kvalues_iq4nl[q1_8[0]], kvalues_iq4nl[q1_8[1]], kvalues_iq4nl[q1_8[2]], kvalues_iq4nl[q1_8[3]]);
1049
+
1050
+ return make_int2(*((const int *) &val0_8), *((const int *) &val1_8));
 
1051
  }
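get_int_from_table_16 now returns both decoded words at once: the low nibbles of the input select the four codebook entries packed into .x, the high nibbles the four packed into .y. A host-side reference of the same mapping; an illustrative sketch that assumes kvalues_iq4nl is the 16-entry signed IQ4 codebook from ggml-common.h, with the function name invented here:

#include <cstdint>

static void table16_ref(uint32_t q4, const int8_t * values,
                        uint32_t & lo, uint32_t & hi) {
    lo = hi = 0;
    for (int i = 0; i < 4; ++i) {
        const uint8_t b = (q4 >> (8*i)) & 0xFF;
        lo |= (uint32_t) (uint8_t) values[b & 0x0F] << (8*i); // low nibble lookup
        hi |= (uint32_t) (uint8_t) values[b >>   4] << (8*i); // high nibble lookup
    }
}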
1052
+
1053
+ #define VDR_IQ4_NL_Q8_1_MMVQ 2
1054
 
1055
  static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
1056
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
1057
 
1058
+ const block_iq4_nl * bq4 = (const block_iq4_nl *) vbq + kbx;
1059
 
1060
+ const int * q8 = (const int *) bq8_1->qs + iqs;
 
 
1061
 
1062
+ int sumi = 0;
1063
+ #pragma unroll
 
 
1064
  for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
1065
+ const int aux_q4 = get_int_b2(bq4->qs, iqs + l);
1066
+ const int2 v = get_int_from_table_16(aux_q4);
1067
 
1068
+ sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
1069
+ sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
 
 
1070
  }
1071
+
1072
+ const float d = __half2float(bq4->d) * __low2float(bq8_1->ds);
1073
+ return d * sumi;
1074
  }
1075
 
1076
+ #define VDR_IQ4_XS_Q8_1_MMVQ 4
1077
+
1078
  static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
1079
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
1080
 
 
1081
  const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq + kbx;
1082
+
1083
+ int sumi = 0;
1084
+ #pragma unroll
1085
  for (int j = 0; j < 4; ++j) {
1086
+ const int aux_q4 = get_int_b4(bq4->qs, iqs + j);
1087
+ const int2 v = get_int_from_table_16(aux_q4);
1088
+
1089
+ const int u0 = get_int_b4(bq8_1[iqs/4].qs, j + 0);
1090
+ const int u1 = get_int_b4(bq8_1[iqs/4].qs, j + 4);
1091
+
1092
+ sumi = ggml_cuda_dp4a(v.x, u0, sumi);
1093
+ sumi = ggml_cuda_dp4a(v.y, u1, sumi);
1094
  }
1095
+
1096
+ const int ls = ((bq4->scales_l[iqs/8] >> (iqs & 0x04)) & 0x0F) | (((bq4->scales_h >> (iqs/2)) & 0x03) << 4);
1097
+ sumi *= ls - 32;
1098
+
1099
+ const float d = __half2float(bq4->d) * __low2float(bq8_1[iqs/4].ds);
1100
+ return d * sumi;
1101
  }
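For reference, the ls computation near the end reassembles the 6-bit IQ4_XS sub-block scale from its packed low and high fields and removes the bias of 32 before the final float multiply. An equivalent standalone form, indexed by the 32-wide sub-block ib32 = iqs/4; an illustrative sketch with field names as in block_iq4_xs and a helper name invented here:

#include <cstdint>

// 4 low bits per sub-block in scales_l (two per byte), 2 high bits in
// scales_h, stored with a bias of 32.
static int iq4_xs_scale_ref(const uint8_t * scales_l, uint16_t scales_h, int ib32) {
    const int lo = (scales_l[ib32/2] >> (4*(ib32 % 2))) & 0x0F;
    const int hi = (scales_h >> (2*ib32)) & 0x03;
    return (lo | (hi << 4)) - 32;
}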