mgroeber9110 (Marcus Groeber) committed
Commit 49e3343 · 1 Parent(s): a027c1d

ggml : portability fixes for VS 2017 (llama/12150)


* Add include files for std::min/max and std::toupper/tolower

* win32: move _USE_MATH_DEFINES before includes to ensure M_PI is defined (see the sketch below the commit message)

* Use GGML_RESTRICT instead of the "restrict" keyword everywhere, and use "__restrict" in MSVC plain C mode

* win32: only use __restrict in MSVC if C11/C17 support is not enabled

---------

Co-authored-by: Marcus Groeber <[email protected]>
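
The _USE_MATH_DEFINES bullet relies on a quirk of MSVC's math headers: the M_* constants (M_PI, M_E, ...) are only emitted if _USE_MATH_DEFINES is already defined when <math.h>/<cmath> is first included; defining it afterwards has no effect because of include guards. A minimal sketch of the required ordering, not the exact ggml source:

    // _USE_MATH_DEFINES must precede the first inclusion of <math.h>,
    // otherwise MSVC's math headers omit M_PI and friends
    #define _USE_MATH_DEFINES
    #include <math.h>
    #include <stdio.h>

    int main(void) {
        printf("pi = %f\n", M_PI);   // available because of the define above
        return 0;
    }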

ggml/include/ggml.h CHANGED
@@ -2140,7 +2140,11 @@ extern "C" {
 # define GGML_RESTRICT
 # endif
 #else
-# define GGML_RESTRICT restrict
+# if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L)
+# define GGML_RESTRICT __restrict
+# else
+# define GGML_RESTRICT restrict
+# endif
 #endif
 typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
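
The new branch covers MSVC's plain C mode: unless C11/C17 support is enabled (/std:c11 or /std:c17), MSVC does not define __STDC_VERSION__ at 201112L or higher and does not accept the C99 restrict keyword, so the header falls back to the vendor-specific __restrict spelling; in a conforming C11+ build the plain restrict path is kept, and in some configurations the macro expands to nothing, as the context lines above show. An illustrative use of the macro (scale_row is a made-up example, not ggml code):

    #include <stdint.h>
    #include "ggml.h"   // for GGML_RESTRICT

    // GGML_RESTRICT expands to restrict, __restrict, or nothing depending on the compiler,
    // letting the optimizer assume dst and src do not alias
    static void scale_row(float * GGML_RESTRICT dst, const float * GGML_RESTRICT src, int64_t n, float s) {
        for (int64_t i = 0; i < n; ++i) {
            dst[i] = src[i] * s;
        }
    }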
ggml/src/ggml-backend-reg.cpp CHANGED
@@ -8,6 +8,7 @@
 #include <string>
 #include <type_traits>
 #include <vector>
+#include <cctype>
 
 #ifdef _WIN32
 # define WIN32_LEAN_AND_MEAN
ggml/src/ggml-backend.cpp CHANGED
@@ -21,6 +21,7 @@
 #include <string.h>
 #include <string>
 #include <vector>
+#include <algorithm>
 
 #ifdef __APPLE__
 #include <sys/types.h>
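
Both added includes address the same portability issue: with MSVC's standard library (notably the VS 2017 one), std::min/std::max and std::toupper/std::tolower are not reliably pulled in transitively by headers such as <string> or <vector>, so translation units that use them must include <algorithm> and <cctype> directly. A small self-contained C++ illustration of the pattern (the function names are illustrative, not taken from ggml):

    #include <algorithm>   // std::min, std::max
    #include <cctype>      // std::toupper, std::tolower
    #include <string>

    // clamp a thread count into a sane range; needs <algorithm> on MSVC
    static int clamp_threads(int n) {
        return std::max(1, std::min(n, 64));
    }

    // upper-case a name; the unsigned char cast avoids UB for negative char values
    static std::string to_upper(std::string s) {
        for (char & c : s) {
            c = (char) std::toupper((unsigned char) c);
        }
        return s;
    }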
ggml/src/ggml-cpu/ggml-cpu-quants.c CHANGED
@@ -719,28 +719,28 @@ static inline __m128i packNibbles( __m256i bytes ) {
719
  }
720
  #endif //__loongarch_asx
721
 
722
- void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
723
  quantize_row_q4_0_ref(x, y, k);
724
  }
725
 
726
- void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
727
  quantize_row_q4_1_ref(x, y, k);
728
  }
729
 
730
- void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
731
  quantize_row_q5_0_ref(x, y, k);
732
  }
733
 
734
- void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
735
  quantize_row_q5_1_ref(x, y, k);
736
  }
737
 
738
- void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
739
  assert(QK8_0 == 32);
740
  assert(k % QK8_0 == 0);
741
  const int nb = k / QK8_0;
742
 
743
- block_q8_0 * restrict y = vy;
744
 
745
  #if defined(__ARM_NEON)
746
  for (int i = 0; i < nb; i++) {
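
The remaining hunks in this file all apply the same mechanical substitution as the signatures removed above: every restrict qualifier on parameters and local pointers is rewritten as GGML_RESTRICT, keeping the no-aliasing hint on compilers that support it while degrading gracefully in MSVC's plain C mode. For the first removed declaration, the change looks like this (the replacement line follows from the commit message):

    // before: C99 keyword, rejected by MSVC when C11/C17 support is not enabled
    void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k);

    // after: portable macro from ggml.h
    void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);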
@@ -1050,11 +1050,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
1050
  #endif
1051
  }
1052
 
1053
- void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
1054
  assert(k % QK8_1 == 0);
1055
  const int nb = k / QK8_1;
1056
 
1057
- block_q8_1 * restrict y = vy;
1058
 
1059
  #if defined(__ARM_NEON)
1060
  for (int i = 0; i < nb; i++) {
@@ -1428,8 +1428,8 @@ static inline int nearest_int(float fval) {
1428
  return (i & 0x007fffff) - 0x00400000;
1429
  }
1430
 
1431
- static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
1432
- const float * restrict qw) {
1433
  float max = 0;
1434
  float amax = 0;
1435
  for (int i = 0; i < n; ++i) {
@@ -1497,7 +1497,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
1497
  return scale;
1498
  }
1499
 
1500
- static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
1501
  float max = 0;
1502
  float amax = 0;
1503
  for (int i = 0; i < n; ++i) {
@@ -1556,7 +1556,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
1556
  return 1/iscale;
1557
  }
1558
 
1559
- static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
1560
  int ntry, float alpha) {
1561
  float min = x[0];
1562
  float max = x[0];
@@ -1599,8 +1599,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
1599
  return scale;
1600
  }
1601
 
1602
- static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
1603
- uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
1604
  float rmin, float rdelta, int nstep, bool use_mad) {
1605
  float min = x[0];
1606
  float max = x[0];
@@ -1680,7 +1680,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
1680
  return scale;
1681
  }
1682
 
1683
- static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
1684
  if (j < 4) {
1685
  *d = q[j] & 63; *m = q[j + 4] & 63;
1686
  } else {
@@ -1691,51 +1691,51 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
1691
 
1692
  //========================- 2-bit (de)-quantization
1693
 
1694
- void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
1695
  quantize_row_q2_K_ref(x, vy, k);
1696
  }
1697
 
1698
  //========================= 3-bit (de)-quantization
1699
 
1700
- void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
1701
  quantize_row_q3_K_ref(x, vy, k);
1702
  }
1703
 
1704
  // ====================== 4-bit (de)-quantization
1705
 
1706
- void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
1707
  assert(k % QK_K == 0);
1708
- block_q4_K * restrict y = vy;
1709
  quantize_row_q4_K_ref(x, y, k);
1710
  }
1711
 
1712
  // ====================== 5-bit (de)-quantization
1713
 
1714
- void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
1715
  assert(k % QK_K == 0);
1716
- block_q5_K * restrict y = vy;
1717
  quantize_row_q5_K_ref(x, y, k);
1718
  }
1719
 
1720
  // ====================== 6-bit (de)-quantization
1721
 
1722
- void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
1723
  assert(k % QK_K == 0);
1724
- block_q6_K * restrict y = vy;
1725
  quantize_row_q6_K_ref(x, y, k);
1726
  }
1727
 
1728
  // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
1729
 
1730
- void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) {
1731
  assert(k % QK_K == 0);
1732
- block_tq1_0 * restrict y = vy;
1733
  quantize_row_tq1_0_ref(x, y, k);
1734
  }
1735
 
1736
- void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) {
1737
  assert(k % QK_K == 0);
1738
- block_tq2_0 * restrict y = vy;
1739
  quantize_row_tq2_0_ref(x, y, k);
1740
  }
1741
 
@@ -1743,11 +1743,11 @@ static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -1
1743
 
1744
  //===================================== Q8_K ==============================================
1745
 
1746
- void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
1747
  #ifdef __wasm_simd128__
1748
  assert(k % QK_K == 0);
1749
  const int64_t nb = k / QK_K;
1750
- block_q8_K * restrict yc = y; // Cast to proper type
1751
 
1752
  for (int i = 0; i < nb; i++) {
1753
  const float * x_block = x + i * QK_K;
@@ -1909,7 +1909,7 @@ static inline __m128i get_scale_shuffle(int i) {
1909
  }
1910
  #endif
1911
 
1912
- void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
1913
  const int qk = QK8_0;
1914
  const int nb = n / qk;
1915
 
@@ -1924,23 +1924,23 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
1924
  UNUSED(by);
1925
  UNUSED(bs);
1926
 
1927
- const block_q4_0 * restrict x = vx;
1928
- const block_q8_0 * restrict y = vy;
1929
 
1930
  #if defined(__ARM_FEATURE_MATMUL_INT8)
1931
  if (nrc == 2) {
1932
- const block_q4_0 * restrict vx0 = vx;
1933
- const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
1934
- const block_q8_0 * restrict vy0 = vy;
1935
- const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
1936
 
1937
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
1938
 
1939
  for (int i = 0; i < nb; i++) {
1940
- const block_q4_0 * restrict b_x0 = &vx0[i];
1941
- const block_q4_0 * restrict b_x1 = &vx1[i];
1942
- const block_q8_0 * restrict b_y0 = &vy0[i];
1943
- const block_q8_0 * restrict b_y1 = &vy1[i];
1944
 
1945
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
1946
  const int8x16_t s8b = vdupq_n_s8(0x8);
@@ -2017,10 +2017,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
2017
  const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
2018
 
2019
  for (; ib + 1 < nb; ib += 2) {
2020
- const block_q4_0 * restrict x0 = &x[ib + 0];
2021
- const block_q4_0 * restrict x1 = &x[ib + 1];
2022
- const block_q8_0 * restrict y0 = &y[ib + 0];
2023
- const block_q8_0 * restrict y1 = &y[ib + 1];
2024
 
2025
  // load x
2026
  const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
@@ -2063,10 +2063,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
2063
  const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
2064
 
2065
  for (; ib + 1 < nb; ib += 2) {
2066
- const block_q4_0 * restrict x0 = &x[ib + 0];
2067
- const block_q4_0 * restrict x1 = &x[ib + 1];
2068
- const block_q8_0 * restrict y0 = &y[ib + 0];
2069
- const block_q8_0 * restrict y1 = &y[ib + 1];
2070
 
2071
  // load x
2072
  const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
@@ -2104,10 +2104,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
2104
  const svbool_t pl16 = svnot_b_z(ph32, ph16);
2105
 
2106
  for (; ib + 1 < nb; ib += 2) {
2107
- const block_q4_0 * restrict x0 = &x[ib + 0];
2108
- const block_q4_0 * restrict x1 = &x[ib + 1];
2109
- const block_q8_0 * restrict y0 = &y[ib + 0];
2110
- const block_q8_0 * restrict y1 = &y[ib + 1];
2111
 
2112
  // load x
2113
  const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
@@ -2144,10 +2144,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
2144
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
2145
 
2146
  for (; ib + 1 < nb; ib += 2) {
2147
- const block_q4_0 * restrict x0 = &x[ib + 0];
2148
- const block_q4_0 * restrict x1 = &x[ib + 1];
2149
- const block_q8_0 * restrict y0 = &y[ib + 0];
2150
- const block_q8_0 * restrict y1 = &y[ib + 1];
2151
 
2152
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
2153
  const int8x16_t s8b = vdupq_n_s8(0x8);
@@ -2189,10 +2189,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
2189
  const v128_t s8b = wasm_i8x16_splat(0x8);
2190
 
2191
  for (; ib + 1 < nb; ib += 2) {
2192
- const block_q4_0 * restrict x0 = &x[ib];
2193
- const block_q4_0 * restrict x1 = &x[ib + 1];
2194
- const block_q8_0 * restrict y0 = &y[ib];
2195
- const block_q8_0 * restrict y1 = &y[ib + 1];
2196
 
2197
  // Load and process x0
2198
  v128_t v0_0 = wasm_v128_load(x0->qs);
@@ -2609,7 +2609,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
2609
  *s = sumf;
2610
  }
2611
 
2612
- void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
2613
  const int qk = QK8_1;
2614
  const int nb = n / qk;
2615
 
@@ -2624,24 +2624,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
2624
  UNUSED(by);
2625
  UNUSED(bs);
2626
 
2627
- const block_q4_1 * restrict x = vx;
2628
- const block_q8_1 * restrict y = vy;
2629
 
2630
  #if defined(__ARM_FEATURE_MATMUL_INT8)
2631
  if (nrc == 2) {
2632
- const block_q4_1 * restrict vx0 = vx;
2633
- const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
2634
- const block_q8_1 * restrict vy0 = vy;
2635
- const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
2636
 
2637
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
2638
  float32x4_t summs0 = vdupq_n_f32(0.0f);
2639
 
2640
  for (int i = 0; i < nb; i++) {
2641
- const block_q4_1 * restrict b_x0 = &vx0[i];
2642
- const block_q4_1 * restrict b_x1 = &vx1[i];
2643
- const block_q8_1 * restrict b_y0 = &vy0[i];
2644
- const block_q8_1 * restrict b_y1 = &vy1[i];
2645
 
2646
  float32_t summs_t[4] = {
2647
  GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
@@ -2715,10 +2715,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
2715
  float summs = 0;
2716
 
2717
  for (; ib + 1 < nb; ib += 2) {
2718
- const block_q4_1 * restrict x0 = &x[ib + 0];
2719
- const block_q4_1 * restrict x1 = &x[ib + 1];
2720
- const block_q8_1 * restrict y0 = &y[ib + 0];
2721
- const block_q8_1 * restrict y1 = &y[ib + 1];
2722
 
2723
  summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
2724
 
@@ -2931,7 +2931,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
2931
  *s = sumf;
2932
  }
2933
 
2934
- void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
2935
  const int qk = QK8_0;
2936
  const int nb = n / qk;
2937
 
@@ -2946,8 +2946,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
2946
  UNUSED(by);
2947
  UNUSED(bs);
2948
 
2949
- const block_q5_0 * restrict x = vx;
2950
- const block_q8_0 * restrict y = vy;
2951
 
2952
  #if defined(__ARM_NEON)
2953
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -2960,10 +2960,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
2960
  uint64_t tmp1[4];
2961
 
2962
  for (; ib + 1 < nb; ib += 2) {
2963
- const block_q5_0 * restrict x0 = &x[ib];
2964
- const block_q5_0 * restrict x1 = &x[ib + 1];
2965
- const block_q8_0 * restrict y0 = &y[ib];
2966
- const block_q8_0 * restrict y1 = &y[ib + 1];
2967
 
2968
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
2969
 
@@ -3024,8 +3024,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3024
 
3025
  // TODO: check if unrolling this is better
3026
  for (; ib < nb; ++ib) {
3027
- const block_q5_0 * restrict x0 = &x[ib];
3028
- const block_q8_0 * restrict y0 = &y[ib];
3029
 
3030
  const v128_t m4b = wasm_i8x16_splat(0x0F);
3031
 
@@ -3286,7 +3286,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3286
  *s = sumf;
3287
  }
3288
 
3289
- void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
3290
  const int qk = QK8_1;
3291
  const int nb = n / qk;
3292
 
@@ -3301,8 +3301,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
3301
  UNUSED(by);
3302
  UNUSED(bs);
3303
 
3304
- const block_q5_1 * restrict x = vx;
3305
- const block_q8_1 * restrict y = vy;
3306
 
3307
  #if defined(__ARM_NEON)
3308
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -3318,10 +3318,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
3318
  uint64_t tmp1[4];
3319
 
3320
  for (; ib + 1 < nb; ib += 2) {
3321
- const block_q5_1 * restrict x0 = &x[ib];
3322
- const block_q5_1 * restrict x1 = &x[ib + 1];
3323
- const block_q8_1 * restrict y0 = &y[ib];
3324
- const block_q8_1 * restrict y1 = &y[ib + 1];
3325
 
3326
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
3327
 
@@ -3387,8 +3387,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
3387
 
3388
  // TODO: check if unrolling this is better
3389
  for (; ib < nb; ++ib) {
3390
- const block_q5_1 * restrict x0 = &x[ib];
3391
- const block_q8_1 * restrict y0 = &y[ib];
3392
 
3393
  summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
3394
 
@@ -3660,7 +3660,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
3660
  *s = sumf;
3661
  }
3662
 
3663
- void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
3664
  const int qk = QK8_0;
3665
  const int nb = n / qk;
3666
 
@@ -3675,24 +3675,24 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3675
  UNUSED(by);
3676
  UNUSED(bs);
3677
 
3678
- const block_q8_0 * restrict x = vx;
3679
- const block_q8_0 * restrict y = vy;
3680
 
3681
  #if defined(__ARM_FEATURE_MATMUL_INT8)
3682
  if (nrc == 2) {
3683
- const block_q8_0 * restrict vx0 = vx;
3684
- const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
3685
- const block_q8_0 * restrict vy0 = vy;
3686
- const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
3687
 
3688
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
3689
 
3690
  for (int i = 0; i < nb; i++) {
3691
- const block_q8_0 * restrict b_x0 = &vx0[i];
3692
- const block_q8_0 * restrict b_y0 = &vy0[i];
3693
 
3694
- const block_q8_0 * restrict b_x1 = &vx1[i];
3695
- const block_q8_0 * restrict b_y1 = &vy1[i];
3696
 
3697
  const int8x16_t x0_l = vld1q_s8(b_x0->qs);
3698
  const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
@@ -3757,10 +3757,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3757
  const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
3758
 
3759
  for (; ib + 1 < nb; ib += 2) {
3760
- const block_q8_0 * restrict x0 = &x[ib + 0];
3761
- const block_q8_0 * restrict x1 = &x[ib + 1];
3762
- const block_q8_0 * restrict y0 = &y[ib + 0];
3763
- const block_q8_0 * restrict y1 = &y[ib + 1];
3764
 
3765
  // load x
3766
  const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
@@ -3788,10 +3788,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3788
  {
3789
  //printf("sve256");
3790
  for (; ib + 1 < nb; ib += 2) {
3791
- const block_q8_0 * restrict x0 = &x[ib + 0];
3792
- const block_q8_0 * restrict x1 = &x[ib + 1];
3793
- const block_q8_0 * restrict y0 = &y[ib + 0];
3794
- const block_q8_0 * restrict y1 = &y[ib + 1];
3795
 
3796
  // load x
3797
  const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
@@ -3824,10 +3824,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3824
  svfloat32_t sumv00 = svdup_n_f32(0.0f);
3825
 
3826
  for (; ib + 1 < nb; ib += 2) {
3827
- const block_q8_0 * restrict x0 = &x[ib + 0];
3828
- const block_q8_0 * restrict x1 = &x[ib + 1];
3829
- const block_q8_0 * restrict y0 = &y[ib + 0];
3830
- const block_q8_0 * restrict y1 = &y[ib + 1];
3831
 
3832
  //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
3833
  // and add them to make one 64 element vector
@@ -3867,10 +3867,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3867
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
3868
 
3869
  for (; ib + 1 < nb; ib += 2) {
3870
- const block_q8_0 * restrict x0 = &x[ib + 0];
3871
- const block_q8_0 * restrict x1 = &x[ib + 1];
3872
- const block_q8_0 * restrict y0 = &y[ib + 0];
3873
- const block_q8_0 * restrict y1 = &y[ib + 1];
3874
 
3875
  const int8x16_t x0_0 = vld1q_s8(x0->qs);
3876
  const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
@@ -3897,8 +3897,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3897
  v128_t sumv = wasm_f32x4_splat(0.0f);
3898
 
3899
  for (; ib < nb; ++ib) {
3900
- const block_q8_0 * restrict x0 = &x[ib];
3901
- const block_q8_0 * restrict y0 = &y[ib];
3902
 
3903
  const v128_t x0_0 = wasm_v128_load(x0->qs);
3904
  const v128_t x0_1 = wasm_v128_load(x0->qs + 16);
@@ -4080,15 +4080,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
4080
  *s = sumf;
4081
  }
4082
 
4083
- void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
4084
  assert(nrc == 1);
4085
  UNUSED(nrc);
4086
  UNUSED(bx);
4087
  UNUSED(by);
4088
  UNUSED(bs);
4089
 
4090
- const block_tq1_0 * restrict x = vx;
4091
- const block_q8_K * restrict y = vy;
4092
 
4093
  const int nb = n / QK_K;
4094
 
@@ -4403,15 +4403,15 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
4403
  #endif
4404
  }
4405
 
4406
- void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
4407
  assert(nrc == 1);
4408
  UNUSED(nrc);
4409
  UNUSED(bx);
4410
  UNUSED(by);
4411
  UNUSED(bs);
4412
 
4413
- const block_tq2_0 * restrict x = vx;
4414
- const block_q8_K * restrict y = vy;
4415
 
4416
  const int nb = n / QK_K;
4417
 
@@ -4575,15 +4575,15 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void *
4575
  #endif
4576
  }
4577
 
4578
- void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
4579
  assert(nrc == 1);
4580
  UNUSED(nrc);
4581
  UNUSED(bx);
4582
  UNUSED(by);
4583
  UNUSED(bs);
4584
 
4585
- const block_q2_K * restrict x = vx;
4586
- const block_q8_K * restrict y = vy;
4587
 
4588
  const int nb = n / QK_K;
4589
 
@@ -4603,9 +4603,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4603
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4604
  svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
4605
 
4606
- const uint8_t * restrict q2 = x[i].qs;
4607
- const int8_t * restrict q8_sv = y[i].qs;
4608
- const uint8_t * restrict sc = x[i].scales;
4609
 
4610
  svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc);
4611
  const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
@@ -4748,9 +4748,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4748
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4749
  svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
4750
 
4751
- const uint8_t * restrict q2 = x[i].qs;
4752
- const int8_t * restrict q8_sv = y[i].qs;
4753
- const uint8_t * restrict sc = x[i].scales;
4754
 
4755
  const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8;
4756
  const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s));
@@ -4847,9 +4847,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4847
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4848
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4849
 
4850
- const uint8_t * restrict q2 = x[i].qs;
4851
- const int8_t * restrict q8 = y[i].qs;
4852
- const uint8_t * restrict sc = x[i].scales;
4853
 
4854
  const uint8x16_t mins_and_scales = vld1q_u8(sc);
4855
  const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
@@ -4912,8 +4912,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4912
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4913
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4914
 
4915
- const uint8_t * restrict q2 = x[i].qs;
4916
- const int8_t * restrict q8 = y[i].qs;
4917
 
4918
  const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
4919
  const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
@@ -4979,8 +4979,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4979
  const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4980
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4981
 
4982
- const uint8_t * restrict q2 = x[i].qs;
4983
- const int8_t * restrict q8 = y[i].qs;
4984
 
4985
  // load mins and scales from block_q2_K.scales[QK_K/16]
4986
  const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
@@ -5306,8 +5306,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5306
  vector signed int vsumi6 = v0;
5307
  vector signed int vsumi7 = v0;
5308
 
5309
- const uint8_t * restrict q2 = x[i].qs;
5310
- const int8_t * restrict q8 = y[i].qs;
5311
 
5312
  for (int j = 0; j < QK_K/128; ++j) {
5313
  __builtin_prefetch(q2, 0, 1);
@@ -5398,8 +5398,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5398
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5399
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
5400
 
5401
- const uint8_t * restrict q2 = x[i].qs;
5402
- const int8_t * restrict q8 = y[i].qs;
5403
 
5404
  const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
5405
  const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf);
@@ -5492,7 +5492,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5492
  #endif
5493
  }
5494
 
5495
- void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
5496
  assert(n % QK_K == 0);
5497
  assert(nrc == 1);
5498
  UNUSED(nrc);
@@ -5503,8 +5503,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5503
  const uint32_t kmask1 = 0x03030303;
5504
  const uint32_t kmask2 = 0x0f0f0f0f;
5505
 
5506
- const block_q3_K * restrict x = vx;
5507
- const block_q8_K * restrict y = vy;
5508
 
5509
  const int nb = n / QK_K;
5510
 
@@ -5529,9 +5529,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5529
 
5530
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5531
 
5532
- const uint8_t * restrict q3_sv = x[i].qs;
5533
- const uint8_t * restrict qh_sv = x[i].hmask;
5534
- const int8_t * restrict q8_sv = y[i].qs;
5535
 
5536
  // Set up scales
5537
  memcpy(aux, x[i].scales, 12);
@@ -5705,9 +5705,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5705
 
5706
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5707
 
5708
- const uint8_t * restrict q3 = x[i].qs;
5709
- const uint8_t * restrict qh = x[i].hmask;
5710
- const int8_t * restrict q8 = y[i].qs;
5711
 
5712
  ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
5713
 
@@ -5791,8 +5791,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5791
 
5792
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5793
 
5794
- const uint8_t * restrict q3 = x[i].qs;
5795
- const int8_t * restrict q8 = y[i].qs;
5796
 
5797
  // Set up scales
5798
  memcpy(aux, x[i].scales, 12);
@@ -5896,8 +5896,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5896
 
5897
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5898
 
5899
- const uint8_t * restrict q3 = x[i].qs;
5900
- const int8_t * restrict q8 = y[i].qs;
5901
 
5902
  // Set up scales
5903
  aux = (const uint32_t *)x[i].scales;
@@ -6030,9 +6030,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6030
 
6031
  float sumf = 0;
6032
  for (int i = 0; i < nb; ++i) {
6033
- const uint8_t * restrict q3 = x[i].qs;
6034
- const uint8_t * restrict hm = x[i].hmask;
6035
- const int8_t * restrict q8 = y[i].qs;
6036
 
6037
  // Process blocks with SIMD
6038
  int8_t * a = aux8;
@@ -6119,9 +6119,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6119
  float sumf = 0;
6120
  for (int i = 0; i < nb; ++i) {
6121
 
6122
- const uint8_t * restrict q3 = x[i].qs;
6123
- const uint8_t * restrict qh = x[i].hmask;
6124
- const int8_t * restrict q8 = y[i].qs;
6125
 
6126
  memcpy(aux, x[i].scales, 12);
6127
  utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
@@ -6261,8 +6261,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6261
  vector signed int vsumi6 = v0;
6262
  vector signed int vsumi7 = v0;
6263
 
6264
- const uint8_t * restrict q3 = x[i].qs;
6265
- const int8_t * restrict q8 = y[i].qs;
6266
 
6267
  for (int j = 0; j < QK_K/128; ++j) {
6268
  __builtin_prefetch(q3, 0, 1);
@@ -6375,8 +6375,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6375
  for (int i = 0; i < nb; ++i) {
6376
 
6377
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6378
- const uint8_t * restrict q3 = x[i].qs;
6379
- const int8_t * restrict q8 = y[i].qs;
6380
  // Set up scales
6381
  memcpy(aux, x[i].scales, 12);
6382
  __m128i scales128 = lsx_set_w(
@@ -6461,11 +6461,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6461
 
6462
  float sumf = 0;
6463
  for (int i = 0; i < nb; ++i) {
6464
- const uint8_t * restrict q3 = x[i].qs;
6465
- const uint8_t * restrict hm = x[i].hmask;
6466
- const int8_t * restrict q8 = y[i].qs;
6467
  memset(aux32, 0, 8*sizeof(int32_t));
6468
- int8_t * restrict a = aux8;
6469
  uint8_t m = 1;
6470
  for (int j = 0; j < QK_K; j += 128) {
6471
  for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
@@ -6508,7 +6508,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6508
 
6509
  }
6510
 
6511
- void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
6512
  assert(n % QK_K == 0);
6513
  assert(nrc == 1);
6514
  UNUSED(nrc);
@@ -6516,8 +6516,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6516
  UNUSED(by);
6517
  UNUSED(bs);
6518
 
6519
- const block_q4_K * restrict x = vx;
6520
- const block_q8_K * restrict y = vy;
6521
 
6522
  const int nb = n / QK_K;
6523
 
@@ -6552,8 +6552,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6552
 
6553
  const uint8_t * scales = (const uint8_t *)utmp;
6554
 
6555
- const uint8_t * restrict q4 = x[i].qs;
6556
- const int8_t * restrict q8 = y[i].qs;
6557
 
6558
  const int vector_length = ggml_cpu_get_sve_cnt()*8;
6559
  const svuint8_t m4b = svdup_n_u8(0xf);
@@ -6640,8 +6640,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6640
 
6641
  const uint8_t * scales = (const uint8_t *)utmp;
6642
 
6643
- const uint8_t * restrict q4 = x[i].qs;
6644
- const int8_t * restrict q8 = y[i].qs;
6645
 
6646
  int32_t sumi1 = 0;
6647
  int32_t sumi2 = 0;
@@ -6679,8 +6679,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6679
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6680
  const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign
6681
 
6682
- const uint8_t * restrict q4 = x[i].qs;
6683
- const int8_t * restrict q8 = y[i].qs;
6684
 
6685
  // Process scales and mins
6686
  memcpy(utmp, x[i].scales, 12);
@@ -6692,7 +6692,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6692
 
6693
  // Sum mins * q8sums
6694
  int32_t sumi = 0;
6695
- const int16_t * restrict q8sums = y[i].bsums;
6696
  const uint8_t * m = (const uint8_t *)&utmp[2];
6697
  for (int j = 0; j < 16; j += 2) {
6698
  sumi += (q8sums[j] + q8sums[j+1]) * m[j/2];
@@ -6791,8 +6791,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6791
  utmp[2] = uaux;
6792
  utmp[0] &= kmask1;
6793
 
6794
- const uint8_t * restrict q4 = x[i].qs;
6795
- const int8_t * restrict q8 = y[i].qs;
6796
 
6797
  const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
6798
 
@@ -6850,8 +6850,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6850
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6851
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
6852
 
6853
- const uint8_t * restrict q4 = x[i].qs;
6854
- const int8_t * restrict q8 = y[i].qs;
6855
 
6856
  memcpy(utmp, x[i].scales, 12);
6857
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -6951,8 +6951,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6951
  vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
6952
  sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
6953
 
6954
- const uint8_t * restrict q4 = x[i].qs;
6955
- const int8_t * restrict q8 = y[i].qs;
6956
 
6957
  vl = 32;
6958
 
@@ -7053,8 +7053,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7053
  vector signed int vsumi2 = v0;
7054
  vector signed int vsumi3 = v0;
7055
 
7056
- const uint8_t * restrict q4 = x[i].qs;
7057
- const int8_t * restrict q8 = y[i].qs;
7058
 
7059
  for (int j = 0; j < QK_K/64; j+=2) {
7060
  __builtin_prefetch(q4, 0, 1);
@@ -7145,8 +7145,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7145
  utmp[2] = uaux;
7146
  utmp[0] &= kmask1;
7147
 
7148
- const uint8_t * restrict q4 = x[i].qs;
7149
- const int8_t * restrict q8 = y[i].qs;
7150
 
7151
  const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]);
7152
  const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128);
@@ -7228,8 +7228,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7228
  sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);
7229
 
7230
  const uint8_t * scales = (const uint8_t *)utmp;
7231
- const uint8_t * restrict x0 = x[i].qs;
7232
- const int8_t * restrict y0 = y[i].qs;
7233
 
7234
  int32_t sumi1 = 0;
7235
  int32_t sumi2 = 0;
@@ -7277,10 +7277,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7277
 
7278
  float sumf = 0;
7279
  for (int i = 0; i < nb; ++i) {
7280
- const uint8_t * restrict q4 = x[i].qs;
7281
- const int8_t * restrict q8 = y[i].qs;
7282
  memset(aux32, 0, 8*sizeof(int32_t));
7283
- int8_t * restrict a = aux8;
7284
  for (int j = 0; j < QK_K/64; ++j) {
7285
  for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
7286
  a += 32;
@@ -7323,7 +7323,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7323
  #endif
7324
  }
7325
 
7326
- void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
7327
  assert(n % QK_K == 0);
7328
  assert(nrc == 1);
7329
  UNUSED(nrc);
@@ -7331,8 +7331,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7331
  UNUSED(by);
7332
  UNUSED(bs);
7333
 
7334
- const block_q5_K * restrict x = vx;
7335
- const block_q8_K * restrict y = vy;
7336
 
7337
  const int nb = n / QK_K;
7338
 
@@ -7374,9 +7374,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7374
 
7375
  const uint8_t * scales = (const uint8_t *)utmp;
7376
 
7377
- const uint8_t * restrict q5 = x[i].qs;
7378
- const uint8_t * restrict qh = x[i].qh;
7379
- const int8_t * restrict q8 = y[i].qs;
7380
 
7381
  ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
7382
 
@@ -7421,8 +7421,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7421
  float summs = 0.f;
7422
 
7423
  for (int i = 0; i < nb; ++i) {
7424
- const uint8_t * restrict q5 = x[i].qs;
7425
- const int8_t * restrict q8 = y[i].qs;
7426
 
7427
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7428
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
@@ -7505,8 +7505,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7505
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7506
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
7507
 
7508
- const uint8_t * restrict q5 = x[i].qs;
7509
- const int8_t * restrict q8 = y[i].qs;
7510
 
7511
  memcpy(utmp, x[i].scales, 12);
7512
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -7597,9 +7597,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7597
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7598
  const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign
7599
 
7600
- const uint8_t * restrict q5 = x[i].qs;
7601
- const uint8_t * restrict qh = x[i].qh;
7602
- const int8_t * restrict q8 = y[i].qs;
7603
 
7604
  // Process scales and mins
7605
  memcpy(utmp, x[i].scales, 12);
@@ -7611,7 +7611,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7611
 
7612
  // Sum mins * q8sums
7613
  int32_t sumi_mins = 0;
7614
- const int16_t * restrict q8sums = y[i].bsums;
7615
  const uint8_t * m = (const uint8_t *)&utmp[2];
7616
  for (int j = 0; j < 16; j += 2) {
7617
  sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2];
@@ -7715,9 +7715,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7715
 
7716
  vl = 8;
7717
 
7718
- const uint8_t * restrict q5 = x[i].qs;
7719
- const uint8_t * restrict hm = x[i].qh;
7720
- const int8_t * restrict q8 = y[i].qs;
7721
 
7722
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7723
  const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
@@ -7856,8 +7856,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7856
  vector signed int vsumi2 = v0;
7857
  vector signed int vsumi3 = v0;
7858
 
7859
- const uint8_t * restrict q5 = x[i].qs;
7860
- const int8_t * restrict q8 = y[i].qs;
7861
 
7862
  for (int j = 0; j < QK_K/64; ++j) {
7863
  __builtin_prefetch(q5, 0, 1);
@@ -7929,8 +7929,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7929
 
7930
  for (int i = 0; i < nb; ++i) {
7931
 
7932
- const uint8_t * restrict q5 = x[i].qs;
7933
- const int8_t * restrict q8 = y[i].qs;
7934
 
7935
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7936
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
@@ -8039,9 +8039,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8039
  const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
8040
 
8041
  const uint8_t * scales = (const uint8_t *)utmp;
8042
- const uint8_t * restrict x0l = x[i].qs;
8043
- const uint8_t * restrict x0h = x[i].qh;
8044
- const int8_t * restrict y0 = y[i].qs;
8045
 
8046
  v_xh[0] = vec_xl(0 , x0h);
8047
  v_xh[1] = vec_xl(16, x0h);
@@ -8094,11 +8094,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8094
 
8095
  float sumf = 0;
8096
  for (int i = 0; i < nb; ++i) {
8097
- const uint8_t * restrict q4 = x[i].qs;
8098
- const uint8_t * restrict hm = x[i].qh;
8099
- const int8_t * restrict q8 = y[i].qs;
8100
  memset(aux32, 0, 8*sizeof(int32_t));
8101
- int8_t * restrict a = aux8;
8102
  uint8_t m = 1;
8103
  for (int j = 0; j < QK_K/64; ++j) {
8104
  for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
@@ -8145,7 +8145,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8145
  #endif
8146
  }
8147
 
8148
- void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
8149
  assert(n % QK_K == 0);
8150
  assert(nrc == 1);
8151
  UNUSED(nrc);
@@ -8153,8 +8153,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8153
  UNUSED(by);
8154
  UNUSED(bs);
8155
 
8156
- const block_q6_K * restrict x = vx;
8157
- const block_q8_K * restrict y = vy;
8158
 
8159
  const int nb = n / QK_K;
8160
 
@@ -8174,11 +8174,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8174
 
8175
  const float d_all = GGML_FP16_TO_FP32(x[i].d);
8176
 
8177
- const uint8_t * restrict q6 = x[i].ql;
8178
- const uint8_t * restrict qh = x[i].qh;
8179
- const int8_t * restrict q8 = y[i].qs;
8180
 
8181
- const int8_t * restrict scale = x[i].scales;
8182
 
8183
  const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
8184
  const int8x16_t scales = vld1q_s8(scale);
@@ -8265,9 +8265,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8265
 
8266
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
8267
 
8268
- const uint8_t * restrict q4 = x[i].ql;
8269
- const uint8_t * restrict qh = x[i].qh;
8270
- const int8_t * restrict q8 = y[i].qs;
8271
 
8272
  const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
8273
 
@@ -8343,9 +8343,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8343
 
8344
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
8345
 
8346
- const uint8_t * restrict q4 = x[i].ql;
8347
- const uint8_t * restrict qh = x[i].qh;
8348
- const int8_t * restrict q8 = y[i].qs;
8349
 
8350
  // handle the q6_k -32 offset separately using bsums
8351
  const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
@@ -8444,8 +8444,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8444
 
8445
  for (int i = 0; i < nb; ++i) {
8446
  // Unpack 6-bit quantized data into aux8 (unchanged)
8447
- const uint8_t * restrict q4 = x[i].ql;
8448
- const uint8_t * restrict qh = x[i].qh;
8449
  int8_t * a = aux8;
8450
  for (int j = 0; j < QK_K; j += 128) {
8451
  for (int l = 0; l < 32; ++l) {
@@ -8459,8 +8459,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8459
  qh += 32;
8460
  }
8461
 
8462
- const int8_t * restrict a_ptr = aux8;
8463
- const int8_t * restrict q8 = y[i].qs;
8464
  v128_t acc0 = wasm_i32x4_splat(0);
8465
  v128_t acc1 = wasm_i32x4_splat(0);
8466
 
@@ -8523,11 +8523,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8523
 
8524
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8525
 
8526
- const uint8_t * restrict q6 = x[i].ql;
8527
- const uint8_t * restrict qh = x[i].qh;
8528
- const int8_t * restrict q8 = y[i].qs;
8529
 
8530
- const int8_t * restrict scale = x[i].scales;
8531
 
8532
  size_t vl;
8533
 
@@ -8629,10 +8629,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8629
  vector signed int vsumi6 = v0;
8630
  vector signed int vsumi7 = v0;
8631
 
8632
- const uint8_t * restrict q6 = x[i].ql;
8633
- const uint8_t * restrict qh = x[i].qh;
8634
- const int8_t * restrict qs = x[i].scales;
8635
- const int8_t * restrict q8 = y[i].qs;
8636
 
8637
  for (int j = 0; j < QK_K/128; ++j) {
8638
  __builtin_prefetch(q6, 0, 0);
@@ -8748,9 +8748,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8748
 
8749
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
8750
 
8751
- const uint8_t * restrict q4 = x[i].ql;
8752
- const uint8_t * restrict qh = x[i].qh;
8753
- const int8_t * restrict q8 = y[i].qs;
8754
 
8755
  const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
8756
  const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
@@ -8816,11 +8816,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8816
  for (int i = 0; i < nb; ++i) {
8817
  const float d_all = GGML_FP16_TO_FP32(x[i].d);
8818
 
8819
- const uint8_t * restrict x0l = x[i].ql;
8820
- const uint8_t * restrict x0h = x[i].qh;
8821
- const int8_t * restrict y0 = y[i].qs;
8822
 
8823
- const int8_t * restrict scale = x[i].scales;
8824
 
8825
  const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
8826
  const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
@@ -8931,11 +8931,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8931
 
8932
  float sumf = 0;
8933
  for (int i = 0; i < nb; ++i) {
8934
- const uint8_t * restrict q4 = x[i].ql;
8935
- const uint8_t * restrict qh = x[i].qh;
8936
- const int8_t * restrict q8 = y[i].qs;
8937
  memset(aux32, 0, 8*sizeof(int32_t));
8938
- int8_t * restrict a = aux8;
8939
  for (int j = 0; j < QK_K; j += 128) {
8940
  for (int l = 0; l < 32; ++l) {
8941
  a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
@@ -9003,7 +9003,7 @@ static const int8_t keven_signs_q2xs[1024] = {
9003
  };
9004
  #endif
9005
 
9006
- void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9007
  assert(n % QK_K == 0);
9008
  assert(nrc == 1);
9009
  UNUSED(nrc);
@@ -9011,8 +9011,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9011
  UNUSED(by);
9012
  UNUSED(bs);
9013
 
9014
- const block_iq2_xxs * restrict x = vx;
9015
- const block_q8_K * restrict y = vy;
9016
 
9017
  const int nb = n / QK_K;
9018
 
@@ -9030,8 +9030,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9030
  float sumf = 0;
9031
  for (int i = 0; i < nb; ++i) {
9032
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9033
- const uint16_t * restrict q2 = x[i].qs;
9034
- const int8_t * restrict q8 = y[i].qs;
9035
  float sumf1 = 0, sumf2 = 0;
9036
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9037
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
@@ -9067,8 +9067,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9067
  __m256 accumf = _mm256_setzero_ps();
9068
  for (int i = 0; i < nb; ++i) {
9069
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9070
- const uint16_t * restrict q2 = x[i].qs;
9071
- const int8_t * restrict q8 = y[i].qs;
9072
  __m256i sumi1 = _mm256_setzero_si256();
9073
  __m256i sumi2 = _mm256_setzero_si256();
9074
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9108,8 +9108,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9108
  __m256 accumf = _mm256_setzero_ps();
9109
  for (int i = 0; i < nb; ++i) {
9110
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9111
- const uint16_t * restrict q2 = x[i].qs;
9112
- const int8_t * restrict q8 = y[i].qs;
9113
  __m128i sumi1_0 = _mm_setzero_si128();
9114
  __m128i sumi1_1 = _mm_setzero_si128();
9115
  __m128i sumi2_0 = _mm_setzero_si128();
@@ -9173,8 +9173,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9173
  vector signed int vsumi2 = v0;
9174
  vector signed int vsumi3 = v0;
9175
 
9176
- const uint16_t * restrict q2 = x[i].qs;
9177
- const int8_t * restrict q8 = y[i].qs;
9178
 
9179
  for (int j = 0; j < QK_K/32; j += 2) {
9180
  __builtin_prefetch(q2, 0, 1);
@@ -9250,8 +9250,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9250
  __m256 accumf = (__m256)__lasx_xvldi(0);
9251
  for (int i = 0; i < nb; ++i) {
9252
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9253
- const uint16_t * restrict q2 = x[i].qs;
9254
- const int8_t * restrict q8 = y[i].qs;
9255
  __m256i sumi1 = __lasx_xvldi(0);
9256
  __m256i sumi2 = __lasx_xvldi(0);
9257
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9291,8 +9291,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9291
  //
9292
  // for (int i = 0; i < nb; ++i) {
9293
  // const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9294
- // const uint16_t * restrict q2 = x[i].qs;
9295
- // const int8_t * restrict q8 = y[i].qs;
9296
  //
9297
  // float sumf1 = 0, sumf2 = 0;
9298
  //
@@ -9340,8 +9340,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9340
  float sumf = 0.f;
9341
  for (int i = 0; i < nb; ++i) {
9342
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9343
- const uint16_t * restrict q2 = x[i].qs;
9344
- const int8_t * restrict q8 = y[i].qs;
9345
  int32_t bsum = 0;
9346
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9347
  memcpy(aux32, q2, 2*sizeof(uint32_t));
@@ -9364,7 +9364,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9364
  #endif
9365
  }
9366
 
9367
- void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9368
  assert(n % QK_K == 0);
9369
  assert(nrc == 1);
9370
  UNUSED(nrc);
@@ -9372,8 +9372,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9372
  UNUSED(by);
9373
  UNUSED(bs);
9374
 
9375
- const block_iq2_xs * restrict x = vx;
9376
- const block_q8_K * restrict y = vy;
9377
 
9378
  const int nb = n / QK_K;
9379
 
@@ -9390,8 +9390,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9390
  float sumf = 0;
9391
  for (int i = 0; i < nb; ++i) {
9392
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9393
- const uint16_t * restrict q2 = x[i].qs;
9394
- const int8_t * restrict q8 = y[i].qs;
9395
  const uint8x8_t scales8 = vld1_u8(x[i].scales);
9396
  const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
9397
  const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
@@ -9468,8 +9468,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9468
  __m256 accumf = _mm256_setzero_ps();
9469
  for (int i = 0; i < nb; ++i) {
9470
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9471
- const uint16_t * restrict q2 = x[i].qs;
9472
- const int8_t * restrict q8 = y[i].qs;
9473
 
9474
  memcpy(&aux64, x[i].scales, 8);
9475
  __m128i stmp = _mm_set1_epi64x(aux64);
@@ -9589,8 +9589,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9589
  __m256 accumf = _mm256_setzero_ps();
9590
  for (int i = 0; i < nb; ++i) {
9591
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9592
- const uint16_t * restrict q2 = x[i].qs;
9593
- const int8_t * restrict q8 = y[i].qs;
9594
 
9595
  memcpy(&aux64, x[i].scales, 8);
9596
  __m128i stmp = _mm_set1_epi64x(aux64);
@@ -9744,8 +9744,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9744
  __m256 accumf = (__m256)__lasx_xvldi(0);
9745
  for (int i = 0; i < nb; ++i) {
9746
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9747
- const uint16_t * restrict q2 = x[i].qs;
9748
- const int8_t * restrict q8 = y[i].qs;
9749
 
9750
  memcpy(&aux64, x[i].scales, 8);
9751
  __m128i stmp = __lsx_vreplgr2vr_d(aux64);
@@ -9842,9 +9842,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9842
  vector signed int vsumi2 = v0;
9843
  vector signed int vsumi3 = v0;
9844
 
9845
- const uint16_t * restrict q2 = x[i].qs;
9846
- const uint8_t * restrict sc = x[i].scales;
9847
- const int8_t * restrict q8 = y[i].qs;
9848
 
9849
  for (int j = 0; j < QK_K/64; ++j) {
9850
  __builtin_prefetch(q2, 0, 1);
@@ -9914,9 +9914,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9914
  float sumf = 0.f;
9915
  for (int i = 0; i < nb; ++i) {
9916
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9917
- const uint16_t * restrict q2 = x[i].qs;
9918
- const uint8_t * restrict sc = x[i].scales;
9919
- const int8_t * restrict q8 = y[i].qs;
9920
  int32_t bsum = 0;
9921
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9922
  const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
@@ -9949,7 +9949,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9949
  #endif
9950
  }
9951
 
9952
- void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9953
  assert(n % QK_K == 0);
9954
  assert(nrc == 1);
9955
  UNUSED(nrc);
@@ -9957,8 +9957,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9957
  UNUSED(by);
9958
  UNUSED(bs);
9959
 
9960
- const block_iq2_s * restrict x = vx;
9961
- const block_q8_K * restrict y = vy;
9962
 
9963
  const int nb = n / QK_K;
9964
 
@@ -9984,10 +9984,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9984
 
9985
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9986
 
9987
- const uint8_t * restrict qs = x[i].qs;
9988
- const uint8_t * restrict qh = x[i].qh;
9989
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9990
- const int8_t * restrict q8 = y[i].qs;
9991
 
9992
  int sumi1 = 0, sumi2 = 0;
9993
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -10058,10 +10058,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
10058
  __m256 accumf = _mm256_setzero_ps();
10059
  for (int i = 0; i < nb; ++i) {
10060
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10061
- const uint8_t * restrict qs = x[i].qs;
10062
- const uint8_t * restrict qh = x[i].qh;
10063
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
10064
- const int8_t * restrict q8 = y[i].qs;
10065
 
10066
  memcpy(&aux64, x[i].scales, 8);
10067
  const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
@@ -10131,10 +10131,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
10131
  __m256 accumf = _mm256_setzero_ps();
10132
  for (int i = 0; i < nb; ++i) {
10133
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10134
- const uint8_t * restrict qs = x[i].qs;
10135
- const uint8_t * restrict qh = x[i].qh;
10136
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
10137
- const int8_t * restrict q8 = y[i].qs;
10138
 
10139
  memcpy(&aux64, x[i].scales, 8);
10140
  const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
@@ -10229,11 +10229,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
10229
  vector signed int vsumi2 = v0;
10230
  vector signed int vsumi3 = v0;
10231
 
10232
- const uint8_t * restrict q2 = x[i].qs;
10233
- const uint8_t * restrict qh = x[i].qh;
10234
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
10235
- const uint8_t * restrict sc = x[i].scales;
10236
- const int8_t * restrict q8 = y[i].qs;
10237
 
10238
  for (int j = 0; j < QK_K/32; j += 2) {
10239
  __builtin_prefetch(q2, 0, 1);
@@ -10330,10 +10330,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
10330
  __m256 accumf = (__m256)__lasx_xvldi(0);
10331
  for (int i = 0; i < nb; ++i) {
10332
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10333
- const uint8_t * restrict qs = x[i].qs;
10334
- const uint8_t * restrict qh = x[i].qh;
10335
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
10336
- const int8_t * restrict q8 = y[i].qs;
10337
 
10338
  __m128i tmp1;
10339
  memcpy(&aux64, x[i].scales, 8);
@@ -10427,7 +10427,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
10427
 
10428
  }
10429
 
10430
- void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10431
  assert(n % QK_K == 0);
10432
  assert(nrc == 1);
10433
  UNUSED(nrc);
@@ -10435,8 +10435,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10435
  UNUSED(by);
10436
  UNUSED(bs);
10437
 
10438
- const block_iq3_xxs * restrict x = vx;
10439
- const block_q8_K * restrict y = vy;
10440
 
10441
  const int nb = n / QK_K;
10442
 
@@ -10452,9 +10452,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10452
  float sumf = 0;
10453
  for (int i = 0; i < nb; ++i) {
10454
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10455
- const uint8_t * restrict q3 = x[i].qs;
10456
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
10457
- const int8_t * restrict q8 = y[i].qs;
10458
  float sumf1 = 0, sumf2 = 0;
10459
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10460
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
@@ -10490,9 +10490,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10490
  __m256 accumf = _mm256_setzero_ps();
10491
  for (int i = 0; i < nb; ++i) {
10492
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10493
- const uint8_t * restrict q3 = x[i].qs;
10494
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
10495
- const int8_t * restrict q8 = y[i].qs;
10496
  __m256i sumi1 = _mm256_setzero_si256();
10497
  __m256i sumi2 = _mm256_setzero_si256();
10498
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -10535,9 +10535,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10535
  __m256 accumf = _mm256_setzero_ps();
10536
  for (int i = 0; i < nb; ++i) {
10537
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10538
- const uint8_t * restrict q3 = x[i].qs;
10539
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
10540
- const int8_t * restrict q8 = y[i].qs;
10541
  __m128i sumi1_0 = _mm_setzero_si128();
10542
  __m128i sumi1_1 = _mm_setzero_si128();
10543
  __m128i sumi2_0 = _mm_setzero_si128();
@@ -10604,9 +10604,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10604
  vector signed int vsumi2 = v0;
10605
  vector signed int vsumi3 = v0;
10606
 
10607
- const uint8_t * restrict q3 = x[i].qs;
10608
- const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
10609
- const int8_t * restrict q8 = y[i].qs;
10610
 
10611
  #pragma GCC unroll 1
10612
  for (int j = 0; j < QK_K/32; j += 2) {
@@ -10678,9 +10678,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10678
  __m256 accumf = (__m256)__lasx_xvldi(0);
10679
  for (int i = 0; i < nb; ++i) {
10680
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10681
- const uint8_t * restrict q3 = x[i].qs;
10682
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
10683
- const int8_t * restrict q8 = y[i].qs;
10684
  __m256i sumi1 = __lasx_xvldi(0);
10685
  __m256i sumi2 = __lasx_xvldi(0);
10686
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -10723,9 +10723,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10723
  float sumf = 0.f;
10724
  for (int i = 0; i < nb; ++i) {
10725
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10726
- const uint8_t * restrict q3 = x[i].qs;
10727
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
10728
- const int8_t * restrict q8 = y[i].qs;
10729
  int32_t bsum = 0;
10730
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
10731
  memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
@@ -10750,7 +10750,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10750
  #endif
10751
  }
10752
 
10753
- void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10754
  assert(n % QK_K == 0);
10755
  assert(nrc == 1);
10756
  UNUSED(nrc);
@@ -10758,8 +10758,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10758
  UNUSED(by);
10759
  UNUSED(bs);
10760
 
10761
- const block_iq3_s * restrict x = vx;
10762
- const block_q8_K * restrict y = vy;
10763
 
10764
  const int nb = n / QK_K;
10765
 
@@ -10796,10 +10796,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10796
  float sumf = 0;
10797
  for (int i = 0; i < nb; ++i) {
10798
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10799
- const uint8_t * restrict qs = x[i].qs;
10800
- const uint8_t * restrict qh = x[i].qh;
10801
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10802
- const int8_t * restrict q8 = y[i].qs;
10803
 
10804
  memcpy(scales32, x[i].scales, 4);
10805
  scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
@@ -10878,10 +10878,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10878
  __m256 accumf = _mm256_setzero_ps();
10879
  for (int i = 0; i < nb; ++i) {
10880
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10881
- const uint8_t * restrict qs = x[i].qs;
10882
- const uint8_t * restrict qh = x[i].qh;
10883
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10884
- const int8_t * restrict q8 = y[i].qs;
10885
  __m256i sumi1 = _mm256_setzero_si256();
10886
  __m256i sumi2 = _mm256_setzero_si256();
10887
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -10963,10 +10963,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10963
  __m256 accumf = _mm256_setzero_ps();
10964
  for (int i = 0; i < nb; ++i) {
10965
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10966
- const uint8_t * restrict qs = x[i].qs;
10967
- const uint8_t * restrict qh = x[i].qh;
10968
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10969
- const int8_t * restrict q8 = y[i].qs;
10970
  __m128i sumi1_0 = _mm_setzero_si128();
10971
  __m128i sumi1_1 = _mm_setzero_si128();
10972
  __m128i sumi2_0 = _mm_setzero_si128();
@@ -11064,11 +11064,11 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
11064
  vector float vyd = vec_splats(y[i].d);
11065
  vector float vd = vec_mul(vxd, vyd);
11066
 
11067
- const uint8_t * restrict q3 = x[i].qs;
11068
- const uint8_t * restrict qh = x[i].qh;
11069
- const uint16_t * restrict signs = (const uint16_t *)(x[i].signs);
11070
- const uint8_t * restrict sc = x[i].scales;
11071
- const int8_t * restrict q8 = y[i].qs;
11072
 
11073
  vector signed int vsumi0 = v0;
11074
  vector signed int vsumi1 = v0;
@@ -11175,10 +11175,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
11175
  __m256 accumf = (__m256)__lasx_xvldi(0);
11176
  for (int i = 0; i < nb; ++i) {
11177
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
11178
- const uint8_t * restrict qs = x[i].qs;
11179
- const uint8_t * restrict qh = x[i].qh;
11180
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
11181
- const int8_t * restrict q8 = y[i].qs;
11182
  __m256i sumi1 = __lasx_xvldi(0);
11183
  __m256i sumi2 = __lasx_xvldi(0);
11184
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -11236,10 +11236,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
11236
  float sumf = 0.f;
11237
  for (int i = 0; i < nb; ++i) {
11238
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
11239
- const uint8_t * restrict qs = x[i].qs;
11240
- const uint8_t * restrict qh = x[i].qh;
11241
- const uint8_t * restrict signs = x[i].signs;
11242
- const int8_t * restrict q8 = y[i].qs;
11243
  int32_t bsum = 0;
11244
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
11245
  const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
@@ -11291,7 +11291,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
11291
  }
11292
  #endif
11293
 
11294
- void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
11295
  assert(n % QK_K == 0);
11296
  assert(nrc == 1);
11297
  UNUSED(nrc);
@@ -11299,8 +11299,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
11299
  UNUSED(by);
11300
  UNUSED(bs);
11301
 
11302
- const block_iq1_s * restrict x = vx;
11303
- const block_q8_K * restrict y = vy;
11304
 
11305
  const int nb = n / QK_K;
11306
 
@@ -11458,10 +11458,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
11458
  vector signed int vsumi3 = vec_splats((int32_t)0);
11459
  vector signed int vsumi8 = vec_splats((int32_t)0);
11460
 
11461
- const uint8_t * restrict q1 = x[i].qs;
11462
- const uint16_t * restrict qh = x[i].qh;
11463
- const int8_t * restrict q8 = y[i].qs;
11464
- const int16_t * restrict qs = y[i].bsums;
11465
 
11466
  for (int j = 0; j < QK_K/32; j += 2) {
11467
  __builtin_prefetch(q1, 0, 1);
@@ -11622,7 +11622,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
11622
  #endif
11623
  }
11624
 
11625
- void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
11626
  assert(n % QK_K == 0);
11627
  assert(nrc == 1);
11628
  UNUSED(nrc);
@@ -11630,8 +11630,8 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
11630
  UNUSED(by);
11631
  UNUSED(bs);
11632
 
11633
- const block_iq1_m * restrict x = vx;
11634
- const block_q8_K * restrict y = vy;
11635
 
11636
  const int nb = n / QK_K;
11637
 
@@ -11912,7 +11912,7 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
11912
  #endif
11913
  }
11914
 
11915
- void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
11916
  assert(nrc == 1);
11917
  UNUSED(nrc);
11918
  UNUSED(bx);
@@ -11921,8 +11921,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
11921
  assert(n % QK4_NL == 0);
11922
  static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
11923
 
11924
- const block_iq4_nl * restrict x = vx;
11925
- const block_q8_0 * restrict y = vy;
11926
 
11927
  const int nb = n / QK4_NL;
11928
 
@@ -12097,8 +12097,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
12097
  const uint8x16_t v_m = vec_splat_u8(0x0F);
12098
 
12099
  for (; ib < nb; ++ib) {
12100
- const block_iq4_nl * restrict x0 = &x[ib];
12101
- const block_q8_0 * restrict y0 = &y[ib];
12102
 
12103
  const uint8x16_t v_x = vec_xl(0, x0->qs);
12104
  int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
@@ -12126,7 +12126,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
12126
  *s = sumf;
12127
  }
12128
 
12129
- void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
12130
  assert(nrc == 1);
12131
  UNUSED(nrc);
12132
  UNUSED(bx);
@@ -12134,8 +12134,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
12134
  UNUSED(bs);
12135
  assert(n % QK_K == 0);
12136
 
12137
- const block_iq4_xs * restrict x = vx;
12138
- const block_q8_K * restrict y = vy;
12139
 
12140
  const int nb = n / QK_K;
12141
 
@@ -12292,9 +12292,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
12292
 
12293
  uint16_t h = x[ibl].scales_h;
12294
 
12295
- const uint8_t * restrict q4 = x[ibl].qs;
12296
- const uint8_t * restrict sc = x[ibl].scales_l;
12297
- const int8_t * restrict q8 = y[ibl].qs;
12298
 
12299
  for (int ib = 0; ib < QK_K/64; ib ++ ) {
12300
  __builtin_prefetch(q4, 0, 1);
@@ -12398,8 +12398,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
12398
  float sumf = 0;
12399
 
12400
  for (int ibl = 0; ibl < nb; ++ibl) {
12401
- const uint8_t * restrict q4 = x[ibl].qs;
12402
- const int8_t * restrict q8 = y[ibl].qs;
12403
 
12404
  uint16_t h = x[ibl].scales_h;
12405
 
@@ -12479,12 +12479,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
12479
 
12480
  // ============================ 4-bit non-linear quants
12481
 
12482
- void quantize_row_iq4_nl(const float * restrict x, void * restrict y, int64_t k) {
12483
  assert(k % QK4_NL == 0);
12484
  quantize_row_iq4_nl_ref(x, y, k);
12485
  }
12486
 
12487
- void quantize_row_iq4_xs(const float * restrict x, void * restrict y, int64_t k) {
12488
  assert(k % QK_K == 0);
12489
  quantize_iq4_xs(x, y, 1, k, NULL);
12490
  }
 
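The hunks above (the pre-change side of ggml-cpu-quants.c) and the hunks below (the post-change side) show the same substitution throughout this file: the bare C99 `restrict` qualifier on the kernel parameters becomes the GGML_RESTRICT macro, since MSVC's plain C mode does not accept the bare keyword and a macro indirection keeps these signatures portable. A minimal, self-contained sketch of what the qualifier communicates for kernels of this shape is below; it uses plain C99 `restrict` and hypothetical names (toy_vec_dot, qx, qy) and is not ggml code.

    /* Hypothetical sketch, not ggml code: the restrict qualifier promises the
     * compiler that qx and qy never alias, which can help it vectorize loops of
     * this shape. The real kernels spell the qualifier GGML_RESTRICT so the same
     * sources also build in MSVC's plain C mode. */
    #include <stddef.h>
    #include <stdint.h>

    float toy_vec_dot(size_t n, const int8_t * restrict qx, const int8_t * restrict qy, float scale) {
        int32_t acc = 0;
        for (size_t i = 0; i < n; ++i) {
            acc += (int32_t) qx[i] * (int32_t) qy[i];   /* no aliasing assumed between qx and qy */
        }
        return scale * (float) acc;
    }
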
719
  }
720
  #endif //__loongarch_asx
721
 
722
+ void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
723
  quantize_row_q4_0_ref(x, y, k);
724
  }
725
 
726
+ void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
727
  quantize_row_q4_1_ref(x, y, k);
728
  }
729
 
730
+ void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
731
  quantize_row_q5_0_ref(x, y, k);
732
  }
733
 
734
+ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
735
  quantize_row_q5_1_ref(x, y, k);
736
  }
737
 
738
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
739
  assert(QK8_0 == 32);
740
  assert(k % QK8_0 == 0);
741
  const int nb = k / QK8_0;
742
 
743
+ block_q8_0 * GGML_RESTRICT y = vy;
744
 
745
  #if defined(__ARM_NEON)
746
  for (int i = 0; i < nb; i++) {
 
1050
  #endif
1051
  }
1052
 
1053
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
1054
  assert(k % QK8_1 == 0);
1055
  const int nb = k / QK8_1;
1056
 
1057
+ block_q8_1 * GGML_RESTRICT y = vy;
1058
 
1059
  #if defined(__ARM_NEON)
1060
  for (int i = 0; i < nb; i++) {
 
1428
  return (i & 0x007fffff) - 0x00400000;
1429
  }
1430
 
1431
+ static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
1432
+ const float * GGML_RESTRICT qw) {
1433
  float max = 0;
1434
  float amax = 0;
1435
  for (int i = 0; i < n; ++i) {
 
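An aside on the bit trick visible in nearest_int() above (`return (i & 0x007fffff) - 0x00400000;`): adding 1.5 * 2^23 to a float whose magnitude is well below 2^22 pins the exponent, so after the addition the low 23 mantissa bits hold the rounded value plus an offset of 0x00400000; masking and subtracting recovers a round-to-nearest-even integer without a cast. The demo below is a self-contained reconstruction of the idea with a hypothetical name (toy_nearest_int), assuming 32-bit IEEE-754 floats; it is not copied from the file.

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* valid only for |fval| well below 2^22, which is the regime these kernels use */
    static int toy_nearest_int(float fval) {
        float val = fval + 12582912.f;            /* 1.5 * 2^23 pins the exponent        */
        int32_t i;
        memcpy(&i, &val, sizeof(i));              /* reinterpret the float's bit pattern */
        return (i & 0x007fffff) - 0x00400000;     /* mantissa bits minus the 2^22 offset */
    }

    int main(void) {
        printf("%d %d %d\n", toy_nearest_int(2.4f), toy_nearest_int(2.6f), toy_nearest_int(-7.5f));
        return 0;                                 /* prints: 2 3 -8 (ties go to even)    */
    }
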
1497
  return scale;
1498
  }
1499
 
1500
+ static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
1501
  float max = 0;
1502
  float amax = 0;
1503
  for (int i = 0; i < n; ++i) {
 
1556
  return 1/iscale;
1557
  }
1558
 
1559
+ static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
1560
  int ntry, float alpha) {
1561
  float min = x[0];
1562
  float max = x[0];
 
1599
  return scale;
1600
  }
1601
 
1602
+ static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
1603
+ uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
1604
  float rmin, float rdelta, int nstep, bool use_mad) {
1605
  float min = x[0];
1606
  float max = x[0];
 
1680
  return scale;
1681
  }
1682
 
1683
+ static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
1684
  if (j < 4) {
1685
  *d = q[j] & 63; *m = q[j + 4] & 63;
1686
  } else {
 
1691
 
1692
  //========================- 2-bit (de)-quantization
1693
 
1694
+ void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
1695
  quantize_row_q2_K_ref(x, vy, k);
1696
  }
1697
 
1698
  //========================= 3-bit (de)-quantization
1699
 
1700
+ void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
1701
  quantize_row_q3_K_ref(x, vy, k);
1702
  }
1703
 
1704
  // ====================== 4-bit (de)-quantization
1705
 
1706
+ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
1707
  assert(k % QK_K == 0);
1708
+ block_q4_K * GGML_RESTRICT y = vy;
1709
  quantize_row_q4_K_ref(x, y, k);
1710
  }
1711
 
1712
  // ====================== 5-bit (de)-quantization
1713
 
1714
+ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
1715
  assert(k % QK_K == 0);
1716
+ block_q5_K * GGML_RESTRICT y = vy;
1717
  quantize_row_q5_K_ref(x, y, k);
1718
  }
1719
 
1720
  // ====================== 6-bit (de)-quantization
1721
 
1722
+ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
1723
  assert(k % QK_K == 0);
1724
+ block_q6_K * GGML_RESTRICT y = vy;
1725
  quantize_row_q6_K_ref(x, y, k);
1726
  }
1727
 
1728
  // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
1729
 
1730
+ void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
1731
  assert(k % QK_K == 0);
1732
+ block_tq1_0 * GGML_RESTRICT y = vy;
1733
  quantize_row_tq1_0_ref(x, y, k);
1734
  }
1735
 
1736
+ void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
1737
  assert(k % QK_K == 0);
1738
+ block_tq2_0 * GGML_RESTRICT y = vy;
1739
  quantize_row_tq2_0_ref(x, y, k);
1740
  }
1741
 
 
1743
 
1744
  //===================================== Q8_K ==============================================
1745
 
1746
+ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
1747
  #ifdef __wasm_simd128__
1748
  assert(k % QK_K == 0);
1749
  const int64_t nb = k / QK_K;
1750
+ block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type
1751
 
1752
  for (int i = 0; i < nb; i++) {
1753
  const float * x_block = x + i * QK_K;
 
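The ggml_vec_dot_* hunks that follow all share the block-wise pattern set up by the quantize_row_* functions above: data is handled in fixed-size blocks (QK8_0 == 32 is asserted earlier in this file), each block carries its own scale, and the dot product accumulates a small integer dot product per block scaled by the product of the two block scales (the recurring `GGML_FP16_TO_FP32(x[i].d) * y[i].d` factor). A hedged scalar sketch of that pattern is below; toy_block_q8 and toy_vec_dot_q8_q8 are illustrative names, not the real block_q8_0 layout.

    #include <stdint.h>

    #define TOY_QK 32                 /* block size, mirroring QK8_0 == 32 */

    typedef struct {
        float  d;                     /* per-block scale (the real blocks store fp16) */
        int8_t qs[TOY_QK];            /* quantized values                             */
    } toy_block_q8;

    /* n is assumed to be a multiple of TOY_QK */
    float toy_vec_dot_q8_q8(int n, const toy_block_q8 * x, const toy_block_q8 * y) {
        const int nb = n / TOY_QK;
        float sumf = 0.0f;
        for (int i = 0; i < nb; ++i) {
            int32_t sumi = 0;
            for (int j = 0; j < TOY_QK; ++j) {
                sumi += (int32_t) x[i].qs[j] * (int32_t) y[i].qs[j];
            }
            sumf += x[i].d * y[i].d * (float) sumi;   /* scale the per-block integer dot */
        }
        return sumf;
    }

The SIMD branches in the hunks below (NEON, SVE, AVX2, wasm SIMD, POWER9, LoongArch) are unrolled, vectorized variants of this same per-block accumulation.
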
1909
  }
1910
  #endif
1911
 
1912
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1913
  const int qk = QK8_0;
1914
  const int nb = n / qk;
1915
 
 
1924
  UNUSED(by);
1925
  UNUSED(bs);
1926
 
1927
+ const block_q4_0 * GGML_RESTRICT x = vx;
1928
+ const block_q8_0 * GGML_RESTRICT y = vy;
1929
 
1930
  #if defined(__ARM_FEATURE_MATMUL_INT8)
1931
  if (nrc == 2) {
1932
+ const block_q4_0 * GGML_RESTRICT vx0 = vx;
1933
+ const block_q4_0 * GGML_RESTRICT vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
1934
+ const block_q8_0 * GGML_RESTRICT vy0 = vy;
1935
+ const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
1936
 
1937
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
1938
 
1939
  for (int i = 0; i < nb; i++) {
1940
+ const block_q4_0 * GGML_RESTRICT b_x0 = &vx0[i];
1941
+ const block_q4_0 * GGML_RESTRICT b_x1 = &vx1[i];
1942
+ const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];
1943
+ const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];
1944
 
1945
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
1946
  const int8x16_t s8b = vdupq_n_s8(0x8);
 
2017
  const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
2018
 
2019
  for (; ib + 1 < nb; ib += 2) {
2020
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
2021
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
2022
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
2023
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
2024
 
2025
  // load x
2026
  const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
 
2063
  const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
2064
 
2065
  for (; ib + 1 < nb; ib += 2) {
2066
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
2067
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
2068
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
2069
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
2070
 
2071
  // load x
2072
  const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
 
2104
  const svbool_t pl16 = svnot_b_z(ph32, ph16);
2105
 
2106
  for (; ib + 1 < nb; ib += 2) {
2107
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
2108
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
2109
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
2110
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
2111
 
2112
  // load x
2113
  const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
 
2144
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
2145
 
2146
  for (; ib + 1 < nb; ib += 2) {
2147
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
2148
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
2149
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
2150
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
2151
 
2152
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
2153
  const int8x16_t s8b = vdupq_n_s8(0x8);
 
2189
  const v128_t s8b = wasm_i8x16_splat(0x8);
2190
 
2191
  for (; ib + 1 < nb; ib += 2) {
2192
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib];
2193
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
2194
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
2195
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
2196
 
2197
  // Load and process x0
2198
  v128_t v0_0 = wasm_v128_load(x0->qs);
 
2609
  *s = sumf;
2610
  }
2611
 
2612
+ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2613
  const int qk = QK8_1;
2614
  const int nb = n / qk;
2615
 
 
2624
  UNUSED(by);
2625
  UNUSED(bs);
2626
 
2627
+ const block_q4_1 * GGML_RESTRICT x = vx;
2628
+ const block_q8_1 * GGML_RESTRICT y = vy;
2629
 
2630
  #if defined(__ARM_FEATURE_MATMUL_INT8)
2631
  if (nrc == 2) {
2632
+ const block_q4_1 * GGML_RESTRICT vx0 = vx;
2633
+ const block_q4_1 * GGML_RESTRICT vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
2634
+ const block_q8_1 * GGML_RESTRICT vy0 = vy;
2635
+ const block_q8_1 * GGML_RESTRICT vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
2636
 
2637
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
2638
  float32x4_t summs0 = vdupq_n_f32(0.0f);
2639
 
2640
  for (int i = 0; i < nb; i++) {
2641
+ const block_q4_1 * GGML_RESTRICT b_x0 = &vx0[i];
2642
+ const block_q4_1 * GGML_RESTRICT b_x1 = &vx1[i];
2643
+ const block_q8_1 * GGML_RESTRICT b_y0 = &vy0[i];
2644
+ const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i];
2645
 
2646
  float32_t summs_t[4] = {
2647
  GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
 
2715
  float summs = 0;
2716
 
2717
  for (; ib + 1 < nb; ib += 2) {
2718
+ const block_q4_1 * GGML_RESTRICT x0 = &x[ib + 0];
2719
+ const block_q4_1 * GGML_RESTRICT x1 = &x[ib + 1];
2720
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
2721
+ const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
2722
 
2723
  summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
2724
 
 
2931
  *s = sumf;
2932
  }
2933
 
2934
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2935
  const int qk = QK8_0;
2936
  const int nb = n / qk;
2937
 
 
2946
  UNUSED(by);
2947
  UNUSED(bs);
2948
 
2949
+ const block_q5_0 * GGML_RESTRICT x = vx;
2950
+ const block_q8_0 * GGML_RESTRICT y = vy;
2951
 
2952
  #if defined(__ARM_NEON)
2953
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
 
2960
  uint64_t tmp1[4];
2961
 
2962
  for (; ib + 1 < nb; ib += 2) {
2963
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
2964
+ const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
2965
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
2966
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
2967
 
2968
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
2969
 
 
3024
 
3025
  // TODO: check if unrolling this is better
3026
  for (; ib < nb; ++ib) {
3027
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
3028
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
3029
 
3030
  const v128_t m4b = wasm_i8x16_splat(0x0F);
3031
 
 
3286
  *s = sumf;
3287
  }
3288
 
3289
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3290
  const int qk = QK8_1;
3291
  const int nb = n / qk;
3292
 
 
3301
  UNUSED(by);
3302
  UNUSED(bs);
3303
 
3304
+ const block_q5_1 * GGML_RESTRICT x = vx;
3305
+ const block_q8_1 * GGML_RESTRICT y = vy;
3306
 
3307
  #if defined(__ARM_NEON)
3308
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
 
3318
  uint64_t tmp1[4];
3319
 
3320
  for (; ib + 1 < nb; ib += 2) {
3321
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
3322
+ const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
3323
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
3324
+ const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
3325
 
3326
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
3327
 
 
3387
 
3388
  // TODO: check if unrolling this is better
3389
  for (; ib < nb; ++ib) {
3390
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
3391
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
3392
 
3393
  summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
3394
 
 
3660
  *s = sumf;
3661
  }
3662
 
3663
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3664
  const int qk = QK8_0;
3665
  const int nb = n / qk;
3666
 
 
3675
  UNUSED(by);
3676
  UNUSED(bs);
3677
 
3678
+ const block_q8_0 * GGML_RESTRICT x = vx;
3679
+ const block_q8_0 * GGML_RESTRICT y = vy;
3680
 
3681
  #if defined(__ARM_FEATURE_MATMUL_INT8)
3682
  if (nrc == 2) {
3683
+ const block_q8_0 * GGML_RESTRICT vx0 = vx;
3684
+ const block_q8_0 * GGML_RESTRICT vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
3685
+ const block_q8_0 * GGML_RESTRICT vy0 = vy;
3686
+ const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
3687
 
3688
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
3689
 
3690
  for (int i = 0; i < nb; i++) {
3691
+ const block_q8_0 * GGML_RESTRICT b_x0 = &vx0[i];
3692
+ const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];
3693
 
3694
+ const block_q8_0 * GGML_RESTRICT b_x1 = &vx1[i];
3695
+ const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];
3696
 
3697
  const int8x16_t x0_l = vld1q_s8(b_x0->qs);
3698
  const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
 
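The nrc == 2 branch above derives the second input row from the first with plain byte arithmetic: bx and by are the byte strides between consecutive rows, so vx1/vy1 are formed by offsetting the base pointers before casting back to the block type, and each loop iteration then loads one block from both x rows and both y rows. A small sketch of that pointer pattern, using hypothetical names (toy_block, toy_two_row_pointers), is below.

    #include <stddef.h>
    #include <stdint.h>

    typedef struct { float d; int8_t qs[32]; } toy_block;   /* stand-in for the real block types */

    /* row0 starts at the base pointer; row1 sits bx bytes further along */
    void toy_two_row_pointers(const void * vx, size_t bx,
                              const toy_block ** row0, const toy_block ** row1) {
        *row0 = (const toy_block *)  vx;
        *row1 = (const toy_block *) ((const uint8_t *) vx + bx);
    }
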
3757
  const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
3758
 
3759
  for (; ib + 1 < nb; ib += 2) {
3760
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
3761
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
3762
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
3763
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
3764
 
3765
  // load x
3766
  const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
 
3788
  {
3789
  //printf("sve256");
3790
  for (; ib + 1 < nb; ib += 2) {
3791
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
3792
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
3793
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
3794
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
3795
 
3796
  // load x
3797
  const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
 
3824
  svfloat32_t sumv00 = svdup_n_f32(0.0f);
3825
 
3826
  for (; ib + 1 < nb; ib += 2) {
3827
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
3828
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
3829
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
3830
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
3831
 
3832
  //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
3833
  // and add them to make one 64 element vector
 
3867
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
3868
 
3869
  for (; ib + 1 < nb; ib += 2) {
3870
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
3871
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
3872
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
3873
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
3874
 
3875
  const int8x16_t x0_0 = vld1q_s8(x0->qs);
3876
  const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
 
3897
  v128_t sumv = wasm_f32x4_splat(0.0f);
3898
 
3899
  for (; ib < nb; ++ib) {
3900
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib];
3901
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
3902
 
3903
  const v128_t x0_0 = wasm_v128_load(x0->qs);
3904
  const v128_t x0_1 = wasm_v128_load(x0->qs + 16);
 
4080
  *s = sumf;
4081
  }
4082
 
4083
+ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4084
  assert(nrc == 1);
4085
  UNUSED(nrc);
4086
  UNUSED(bx);
4087
  UNUSED(by);
4088
  UNUSED(bs);
4089
 
4090
+ const block_tq1_0 * GGML_RESTRICT x = vx;
4091
+ const block_q8_K * GGML_RESTRICT y = vy;
4092
 
4093
  const int nb = n / QK_K;
4094
 
 
4403
  #endif
4404
  }
4405
 
4406
+ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4407
  assert(nrc == 1);
4408
  UNUSED(nrc);
4409
  UNUSED(bx);
4410
  UNUSED(by);
4411
  UNUSED(bs);
4412
 
4413
+ const block_tq2_0 * GGML_RESTRICT x = vx;
4414
+ const block_q8_K * GGML_RESTRICT y = vy;
4415
 
4416
  const int nb = n / QK_K;
4417
 
 
4575
  #endif
4576
  }
4577
 
4578
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4579
  assert(nrc == 1);
4580
  UNUSED(nrc);
4581
  UNUSED(bx);
4582
  UNUSED(by);
4583
  UNUSED(bs);
4584
 
4585
+ const block_q2_K * GGML_RESTRICT x = vx;
4586
+ const block_q8_K * GGML_RESTRICT y = vy;
4587
 
4588
  const int nb = n / QK_K;
4589
 
 
4603
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4604
  svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
4605
 
4606
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4607
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
4608
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
4609
 
4610
  svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc);
4611
  const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
 
4748
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4749
  svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
4750
 
4751
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4752
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
4753
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
4754
 
4755
  const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8;
4756
  const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s));
 
4847
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4848
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4849
 
4850
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4851
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4852
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
4853
 
4854
  const uint8x16_t mins_and_scales = vld1q_u8(sc);
4855
  const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
 
4912
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4913
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4914
 
4915
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4916
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4917
 
4918
  const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
4919
  const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
 
4979
  const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4980
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4981
 
4982
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4983
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4984
 
4985
  // load mins and scales from block_q2_K.scales[QK_K/16]
4986
  const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
 
5306
  vector signed int vsumi6 = v0;
5307
  vector signed int vsumi7 = v0;
5308
 
5309
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
5310
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5311
 
5312
  for (int j = 0; j < QK_K/128; ++j) {
5313
  __builtin_prefetch(q2, 0, 1);
 
5398
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5399
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
5400
 
5401
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
5402
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5403
 
5404
  const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
5405
  const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf);
 
5492
  #endif
5493
  }
5494
 
5495
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
5496
  assert(n % QK_K == 0);
5497
  assert(nrc == 1);
5498
  UNUSED(nrc);
 
5503
  const uint32_t kmask1 = 0x03030303;
5504
  const uint32_t kmask2 = 0x0f0f0f0f;
5505
 
5506
+ const block_q3_K * GGML_RESTRICT x = vx;
5507
+ const block_q8_K * GGML_RESTRICT y = vy;
5508
 
5509
  const int nb = n / QK_K;
5510
 
 
5529
 
5530
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5531
 
5532
+ const uint8_t * GGML_RESTRICT q3_sv = x[i].qs;
5533
+ const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask;
5534
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
5535
 
5536
  // Set up scales
5537
  memcpy(aux, x[i].scales, 12);
 
5705
 
5706
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5707
 
5708
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
5709
+ const uint8_t * GGML_RESTRICT qh = x[i].hmask;
5710
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5711
 
5712
  ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
5713
 
 
5791
 
5792
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5793
 
5794
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
5795
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5796
 
5797
  // Set up scales
5798
  memcpy(aux, x[i].scales, 12);
 
5896
 
5897
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5898
 
5899
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
5900
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5901
 
5902
  // Set up scales
5903
  aux = (const uint32_t *)x[i].scales;
 
6030
 
6031
  float sumf = 0;
6032
  for (int i = 0; i < nb; ++i) {
6033
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6034
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
6035
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6036
 
6037
  // Process blocks with SIMD
6038
  int8_t * a = aux8;
 
6119
  float sumf = 0;
6120
  for (int i = 0; i < nb; ++i) {
6121
 
6122
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6123
+ const uint8_t * GGML_RESTRICT qh = x[i].hmask;
6124
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6125
 
6126
  memcpy(aux, x[i].scales, 12);
6127
  utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
 
6261
  vector signed int vsumi6 = v0;
6262
  vector signed int vsumi7 = v0;
6263
 
6264
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6265
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6266
 
6267
  for (int j = 0; j < QK_K/128; ++j) {
6268
  __builtin_prefetch(q3, 0, 1);
 
6375
  for (int i = 0; i < nb; ++i) {
6376
 
6377
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6378
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6379
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6380
  // Set up scales
6381
  memcpy(aux, x[i].scales, 12);
6382
  __m128i scales128 = lsx_set_w(
 
6461
 
6462
  float sumf = 0;
6463
  for (int i = 0; i < nb; ++i) {
6464
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6465
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
6466
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6467
  memset(aux32, 0, 8*sizeof(int32_t));
6468
+ int8_t * GGML_RESTRICT a = aux8;
6469
  uint8_t m = 1;
6470
  for (int j = 0; j < QK_K; j += 128) {
6471
  for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
 
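The generic q3_K path above unpacks the packed quants before accumulating: each byte of x[i].qs holds four 2-bit lanes, pulled out with a shift and an `& 3` mask (the `a[l] = q3[l] & 3` line is the shift-0 step). A toy version of that unpacking is sketched below with a hypothetical name (toy_unpack_2bit); the real kernel additionally merges the separate high-bit mask x[i].hmask to form the full signed values, which is omitted here.

    #include <stdint.h>

    /* unpack 32 packed bytes into their low 2-bit lanes; out must hold 4*32 = 128 values */
    void toy_unpack_2bit(const uint8_t * packed, int8_t * out) {
        for (int shift = 0; shift < 8; shift += 2) {        /* lanes 0..3 of each byte */
            for (int l = 0; l < 32; ++l) {
                *out++ = (int8_t) ((packed[l] >> shift) & 3);
            }
        }
    }
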
6508
 
6509
  }
6510
 
6511
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
6512
  assert(n % QK_K == 0);
6513
  assert(nrc == 1);
6514
  UNUSED(nrc);
 
6516
  UNUSED(by);
6517
  UNUSED(bs);
6518
 
6519
+ const block_q4_K * GGML_RESTRICT x = vx;
6520
+ const block_q8_K * GGML_RESTRICT y = vy;
6521
 
6522
  const int nb = n / QK_K;
6523
 
 
6552
 
6553
  const uint8_t * scales = (const uint8_t *)utmp;
6554
 
6555
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6556
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6557
 
6558
  const int vector_length = ggml_cpu_get_sve_cnt()*8;
6559
  const svuint8_t m4b = svdup_n_u8(0xf);
 
6640
 
6641
  const uint8_t * scales = (const uint8_t *)utmp;
6642
 
6643
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6644
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6645
 
6646
  int32_t sumi1 = 0;
6647
  int32_t sumi2 = 0;
 
6679
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6680
  const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign
6681
 
6682
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6683
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6684
 
6685
  // Process scales and mins
6686
  memcpy(utmp, x[i].scales, 12);
 
6692
 
6693
  // Sum mins * q8sums
6694
  int32_t sumi = 0;
6695
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
6696
  const uint8_t * m = (const uint8_t *)&utmp[2];
6697
  for (int j = 0; j < 16; j += 2) {
6698
  sumi += (q8sums[j] + q8sums[j+1]) * m[j/2];
 
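The `sumi += (q8sums[j] + q8sums[j+1]) * m[j/2];` loop above is the mins/bsums shortcut used by the K-quant kernels: the q8_K side stores the sum of every group of 16 quantized values in y[i].bsums, so the per-sub-block minimum offsets can be folded into the result with one 16-iteration loop, which is then applied with the separate dmin scale instead of revisiting every element. A hedged scalar sketch with hypothetical buffer names is below.

    #include <stdint.h>

    /* bsums: 16 group sums (16 elements each); mins: 8 sub-block minimums */
    int32_t toy_mins_dot(const int16_t bsums[16], const uint8_t mins[8]) {
        int32_t sumi = 0;
        for (int j = 0; j < 16; j += 2) {
            sumi += (bsums[j] + bsums[j + 1]) * mins[j / 2];   /* two groups share one sub-block min */
        }
        return sumi;   /* the caller scales this by dmin and folds it into the accumulator */
    }
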
6791
  utmp[2] = uaux;
6792
  utmp[0] &= kmask1;
6793
 
6794
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6795
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6796
 
6797
  const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
6798
 
 
6850
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6851
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
6852
 
6853
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6854
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6855
 
6856
  memcpy(utmp, x[i].scales, 12);
6857
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
 
6951
  vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
6952
  sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
6953
 
6954
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6955
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6956
 
6957
  vl = 32;
6958
 
 
7053
  vector signed int vsumi2 = v0;
7054
  vector signed int vsumi3 = v0;
7055
 
7056
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
7057
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7058
 
7059
  for (int j = 0; j < QK_K/64; j+=2) {
7060
  __builtin_prefetch(q4, 0, 1);
 
7145
  utmp[2] = uaux;
7146
  utmp[0] &= kmask1;
7147
 
7148
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
7149
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7150
 
7151
  const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]);
7152
  const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128);
 
7228
  sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);
7229
 
7230
  const uint8_t * scales = (const uint8_t *)utmp;
7231
+ const uint8_t * GGML_RESTRICT x0 = x[i].qs;
7232
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
7233
 
7234
  int32_t sumi1 = 0;
7235
  int32_t sumi2 = 0;
 
7277
 
7278
  float sumf = 0;
7279
  for (int i = 0; i < nb; ++i) {
7280
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
7281
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7282
  memset(aux32, 0, 8*sizeof(int32_t));
7283
+ int8_t * GGML_RESTRICT a = aux8;
7284
  for (int j = 0; j < QK_K/64; ++j) {
7285
  for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
7286
  a += 32;
 
7323
  #endif
7324
  }
7325
 
7326
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
7327
  assert(n % QK_K == 0);
7328
  assert(nrc == 1);
7329
  UNUSED(nrc);
 
7331
  UNUSED(by);
7332
  UNUSED(bs);
7333
 
7334
+ const block_q5_K * GGML_RESTRICT x = vx;
7335
+ const block_q8_K * GGML_RESTRICT y = vy;
7336
 
7337
  const int nb = n / QK_K;
7338
 
 
7374
 
7375
  const uint8_t * scales = (const uint8_t *)utmp;
7376
 
7377
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7378
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
7379
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7380
 
7381
  ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
7382
 
 
7421
  float summs = 0.f;
7422
 
7423
  for (int i = 0; i < nb; ++i) {
7424
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7425
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7426
 
7427
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7428
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
7505
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7506
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
7507
 
7508
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7509
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7510
 
7511
  memcpy(utmp, x[i].scales, 12);
7512
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
 
7597
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7598
  const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign
7599
 
7600
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7601
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
7602
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7603
 
7604
  // Process scales and mins
7605
  memcpy(utmp, x[i].scales, 12);
 
7611
 
7612
  // Sum mins * q8sums
7613
  int32_t sumi_mins = 0;
7614
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
7615
  const uint8_t * m = (const uint8_t *)&utmp[2];
7616
  for (int j = 0; j < 16; j += 2) {
7617
  sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2];
 
7715
 
7716
  vl = 8;
7717
 
7718
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7719
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
7720
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7721
 
7722
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7723
  const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
 
7856
  vector signed int vsumi2 = v0;
7857
  vector signed int vsumi3 = v0;
7858
 
7859
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7860
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7861
 
7862
  for (int j = 0; j < QK_K/64; ++j) {
7863
  __builtin_prefetch(q5, 0, 1);
 
7929
 
7930
  for (int i = 0; i < nb; ++i) {
7931
 
7932
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7933
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7934
 
7935
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7936
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
8039
  const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
8040
 
8041
  const uint8_t * scales = (const uint8_t *)utmp;
8042
+ const uint8_t * GGML_RESTRICT x0l = x[i].qs;
8043
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
8044
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
8045
 
8046
  v_xh[0] = vec_xl(0 , x0h);
8047
  v_xh[1] = vec_xl(16, x0h);
 
8094
 
8095
  float sumf = 0;
8096
  for (int i = 0; i < nb; ++i) {
8097
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
8098
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
8099
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8100
  memset(aux32, 0, 8*sizeof(int32_t));
8101
+ int8_t * GGML_RESTRICT a = aux8;
8102
  uint8_t m = 1;
8103
  for (int j = 0; j < QK_K/64; ++j) {
8104
  for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
 
8145
  #endif
8146
  }
8147
 
8148
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
8149
  assert(n % QK_K == 0);
8150
  assert(nrc == 1);
8151
  UNUSED(nrc);
 
8153
  UNUSED(by);
8154
  UNUSED(bs);
8155
 
8156
+ const block_q6_K * GGML_RESTRICT x = vx;
8157
+ const block_q8_K * GGML_RESTRICT y = vy;
8158
 
8159
  const int nb = n / QK_K;
8160
 
 
8174
 
8175
  const float d_all = GGML_FP16_TO_FP32(x[i].d);
8176
 
8177
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
8178
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8179
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8180
 
8181
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
8182
 
8183
  const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
8184
  const int8x16_t scales = vld1q_s8(scale);
 
8265
 
8266
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
8267
 
8268
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8269
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8270
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8271
 
8272
  const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
8273
 
 
8343
 
8344
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
8345
 
8346
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8347
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8348
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8349
 
8350
  // handle the q6_k -32 offset separately using bsums
8351
  const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
 
8444
 
8445
  for (int i = 0; i < nb; ++i) {
8446
  // Unpack 6-bit quantized data into aux8 (unchanged)
8447
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8448
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8449
  int8_t * a = aux8;
8450
  for (int j = 0; j < QK_K; j += 128) {
8451
  for (int l = 0; l < 32; ++l) {
 
8459
  qh += 32;
8460
  }
8461
 
8462
+ const int8_t * GGML_RESTRICT a_ptr = aux8;
8463
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8464
  v128_t acc0 = wasm_i32x4_splat(0);
8465
  v128_t acc1 = wasm_i32x4_splat(0);
8466
 
 
8523
 
8524
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8525
 
8526
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
8527
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8528
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8529
 
8530
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
8531
 
8532
  size_t vl;
8533
 
 
8629
  vector signed int vsumi6 = v0;
8630
  vector signed int vsumi7 = v0;
8631
 
8632
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
8633
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8634
+ const int8_t * GGML_RESTRICT qs = x[i].scales;
8635
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8636
 
8637
  for (int j = 0; j < QK_K/128; ++j) {
8638
  __builtin_prefetch(q6, 0, 0);
 
8748
 
8749
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
8750
 
8751
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8752
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8753
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8754
 
8755
  const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
8756
  const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
 
8816
  for (int i = 0; i < nb; ++i) {
8817
  const float d_all = GGML_FP16_TO_FP32(x[i].d);
8818
 
8819
+ const uint8_t * GGML_RESTRICT x0l = x[i].ql;
8820
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
8821
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
8822
 
8823
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
8824
 
8825
  const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
8826
  const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
 
8931
 
8932
  float sumf = 0;
8933
  for (int i = 0; i < nb; ++i) {
8934
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8935
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8936
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8937
  memset(aux32, 0, 8*sizeof(int32_t));
8938
+ int8_t * GGML_RESTRICT a = aux8;
8939
  for (int j = 0; j < QK_K; j += 128) {
8940
  for (int l = 0; l < 32; ++l) {
8941
  a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
 
9003
  };
9004
  #endif
9005
 
9006
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9007
  assert(n % QK_K == 0);
9008
  assert(nrc == 1);
9009
  UNUSED(nrc);
 
9011
  UNUSED(by);
9012
  UNUSED(bs);
9013
 
9014
+ const block_iq2_xxs * GGML_RESTRICT x = vx;
9015
+ const block_q8_K * GGML_RESTRICT y = vy;
9016
 
9017
  const int nb = n / QK_K;
9018
 
 
9030
  float sumf = 0;
9031
  for (int i = 0; i < nb; ++i) {
9032
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9033
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9034
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9035
  float sumf1 = 0, sumf2 = 0;
9036
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9037
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
 
9067
  __m256 accumf = _mm256_setzero_ps();
9068
  for (int i = 0; i < nb; ++i) {
9069
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9070
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9071
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9072
  __m256i sumi1 = _mm256_setzero_si256();
9073
  __m256i sumi2 = _mm256_setzero_si256();
9074
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
 
9108
  __m256 accumf = _mm256_setzero_ps();
9109
  for (int i = 0; i < nb; ++i) {
9110
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9111
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9112
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9113
  __m128i sumi1_0 = _mm_setzero_si128();
9114
  __m128i sumi1_1 = _mm_setzero_si128();
9115
  __m128i sumi2_0 = _mm_setzero_si128();
 
9173
  vector signed int vsumi2 = v0;
9174
  vector signed int vsumi3 = v0;
9175
 
9176
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9177
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9178
 
9179
  for (int j = 0; j < QK_K/32; j += 2) {
9180
  __builtin_prefetch(q2, 0, 1);
 
9250
  __m256 accumf = (__m256)__lasx_xvldi(0);
9251
  for (int i = 0; i < nb; ++i) {
9252
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9253
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9254
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9255
  __m256i sumi1 = __lasx_xvldi(0);
9256
  __m256i sumi2 = __lasx_xvldi(0);
9257
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
 
9291
  //
9292
  // for (int i = 0; i < nb; ++i) {
9293
  // const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9294
+ // const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9295
+ // const int8_t * GGML_RESTRICT q8 = y[i].qs;
9296
  //
9297
  // float sumf1 = 0, sumf2 = 0;
9298
  //
 
9340
  float sumf = 0.f;
9341
  for (int i = 0; i < nb; ++i) {
9342
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9343
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9344
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9345
  int32_t bsum = 0;
9346
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9347
  memcpy(aux32, q2, 2*sizeof(uint32_t));
 
9364
  #endif
9365
  }
9366
 
9367
+ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9368
  assert(n % QK_K == 0);
9369
  assert(nrc == 1);
9370
  UNUSED(nrc);
 
9372
  UNUSED(by);
9373
  UNUSED(bs);
9374
 
9375
+ const block_iq2_xs * GGML_RESTRICT x = vx;
9376
+ const block_q8_K * GGML_RESTRICT y = vy;
9377
 
9378
  const int nb = n / QK_K;
9379
 
 
9390
  float sumf = 0;
9391
  for (int i = 0; i < nb; ++i) {
9392
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9393
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9394
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9395
  const uint8x8_t scales8 = vld1_u8(x[i].scales);
9396
  const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
9397
  const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
 
9468
  __m256 accumf = _mm256_setzero_ps();
9469
  for (int i = 0; i < nb; ++i) {
9470
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9471
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9472
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9473
 
9474
  memcpy(&aux64, x[i].scales, 8);
9475
  __m128i stmp = _mm_set1_epi64x(aux64);
 
9589
  __m256 accumf = _mm256_setzero_ps();
9590
  for (int i = 0; i < nb; ++i) {
9591
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9592
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9593
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9594
 
9595
  memcpy(&aux64, x[i].scales, 8);
9596
  __m128i stmp = _mm_set1_epi64x(aux64);
 
9744
  __m256 accumf = (__m256)__lasx_xvldi(0);
9745
  for (int i = 0; i < nb; ++i) {
9746
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9747
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9748
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9749
 
9750
  memcpy(&aux64, x[i].scales, 8);
9751
  __m128i stmp = __lsx_vreplgr2vr_d(aux64);
 
9842
  vector signed int vsumi2 = v0;
9843
  vector signed int vsumi3 = v0;
9844
 
9845
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9846
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
9847
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9848
 
9849
  for (int j = 0; j < QK_K/64; ++j) {
9850
  __builtin_prefetch(q2, 0, 1);
 
9914
  float sumf = 0.f;
9915
  for (int i = 0; i < nb; ++i) {
9916
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9917
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9918
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
9919
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9920
  int32_t bsum = 0;
9921
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9922
  const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
 
9949
  #endif
9950
  }
9951
 
9952
+ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9953
  assert(n % QK_K == 0);
9954
  assert(nrc == 1);
9955
  UNUSED(nrc);
 
9957
  UNUSED(by);
9958
  UNUSED(bs);
9959
 
9960
+ const block_iq2_s * GGML_RESTRICT x = vx;
9961
+ const block_q8_K * GGML_RESTRICT y = vy;
9962
 
9963
  const int nb = n / QK_K;
9964
 
 
9984
 
9985
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9986
 
9987
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
9988
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
9989
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
9990
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9991
 
9992
  int sumi1 = 0, sumi2 = 0;
9993
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
 
10058
  __m256 accumf = _mm256_setzero_ps();
10059
  for (int i = 0; i < nb; ++i) {
10060
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10061
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10062
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10063
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10064
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10065
 
10066
  memcpy(&aux64, x[i].scales, 8);
10067
  const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
 
10131
  __m256 accumf = _mm256_setzero_ps();
10132
  for (int i = 0; i < nb; ++i) {
10133
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10134
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10135
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10136
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10137
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10138
 
10139
  memcpy(&aux64, x[i].scales, 8);
10140
  const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
 
10229
  vector signed int vsumi2 = v0;
10230
  vector signed int vsumi3 = v0;
10231
 
10232
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
10233
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10234
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10235
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
10236
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10237
 
10238
  for (int j = 0; j < QK_K/32; j += 2) {
10239
  __builtin_prefetch(q2, 0, 1);
 
10330
  __m256 accumf = (__m256)__lasx_xvldi(0);
10331
  for (int i = 0; i < nb; ++i) {
10332
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10333
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10334
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10335
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10336
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10337
 
10338
  __m128i tmp1;
10339
  memcpy(&aux64, x[i].scales, 8);
 
10427
 
10428
  }
10429
 
10430
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10431
  assert(n % QK_K == 0);
10432
  assert(nrc == 1);
10433
  UNUSED(nrc);
 
10435
  UNUSED(by);
10436
  UNUSED(bs);
10437
 
10438
+ const block_iq3_xxs * GGML_RESTRICT x = vx;
10439
+ const block_q8_K * GGML_RESTRICT y = vy;
10440
 
10441
  const int nb = n / QK_K;
10442
 
 
10452
  float sumf = 0;
10453
  for (int i = 0; i < nb; ++i) {
10454
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10455
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10456
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10457
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10458
  float sumf1 = 0, sumf2 = 0;
10459
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10460
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
 
10490
  __m256 accumf = _mm256_setzero_ps();
10491
  for (int i = 0; i < nb; ++i) {
10492
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10493
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10494
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10495
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10496
  __m256i sumi1 = _mm256_setzero_si256();
10497
  __m256i sumi2 = _mm256_setzero_si256();
10498
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
 
10535
  __m256 accumf = _mm256_setzero_ps();
10536
  for (int i = 0; i < nb; ++i) {
10537
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10538
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10539
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10540
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10541
  __m128i sumi1_0 = _mm_setzero_si128();
10542
  __m128i sumi1_1 = _mm_setzero_si128();
10543
  __m128i sumi2_0 = _mm_setzero_si128();
 
10604
  vector signed int vsumi2 = v0;
10605
  vector signed int vsumi3 = v0;
10606
 
10607
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10608
+ const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4);
10609
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10610
 
10611
  #pragma GCC unroll 1
10612
  for (int j = 0; j < QK_K/32; j += 2) {
 
10678
  __m256 accumf = (__m256)__lasx_xvldi(0);
10679
  for (int i = 0; i < nb; ++i) {
10680
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10681
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10682
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10683
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10684
  __m256i sumi1 = __lasx_xvldi(0);
10685
  __m256i sumi2 = __lasx_xvldi(0);
10686
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
 
10723
  float sumf = 0.f;
10724
  for (int i = 0; i < nb; ++i) {
10725
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10726
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10727
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10728
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10729
  int32_t bsum = 0;
10730
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
10731
  memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
 
10750
  #endif
10751
  }
10752
 
10753
+ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10754
  assert(n % QK_K == 0);
10755
  assert(nrc == 1);
10756
  UNUSED(nrc);
 
10758
  UNUSED(by);
10759
  UNUSED(bs);
10760
 
10761
+ const block_iq3_s * GGML_RESTRICT x = vx;
10762
+ const block_q8_K * GGML_RESTRICT y = vy;
10763
 
10764
  const int nb = n / QK_K;
10765
 
 
10796
  float sumf = 0;
10797
  for (int i = 0; i < nb; ++i) {
10798
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10799
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10800
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10801
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
10802
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10803
 
10804
  memcpy(scales32, x[i].scales, 4);
10805
  scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
 
10878
  __m256 accumf = _mm256_setzero_ps();
10879
  for (int i = 0; i < nb; ++i) {
10880
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10881
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10882
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10883
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
10884
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10885
  __m256i sumi1 = _mm256_setzero_si256();
10886
  __m256i sumi2 = _mm256_setzero_si256();
10887
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
 
10963
  __m256 accumf = _mm256_setzero_ps();
10964
  for (int i = 0; i < nb; ++i) {
10965
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10966
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10967
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10968
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
10969
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10970
  __m128i sumi1_0 = _mm_setzero_si128();
10971
  __m128i sumi1_1 = _mm_setzero_si128();
10972
  __m128i sumi2_0 = _mm_setzero_si128();
 
11064
  vector float vyd = vec_splats(y[i].d);
11065
  vector float vd = vec_mul(vxd, vyd);
11066
 
11067
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
11068
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11069
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs);
11070
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
11071
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
11072
 
11073
  vector signed int vsumi0 = v0;
11074
  vector signed int vsumi1 = v0;
 
11175
  __m256 accumf = (__m256)__lasx_xvldi(0);
11176
  for (int i = 0; i < nb; ++i) {
11177
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
11178
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
11179
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11180
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
11181
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
11182
  __m256i sumi1 = __lasx_xvldi(0);
11183
  __m256i sumi2 = __lasx_xvldi(0);
11184
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
 
11236
  float sumf = 0.f;
11237
  for (int i = 0; i < nb; ++i) {
11238
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
11239
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
11240
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11241
+ const uint8_t * GGML_RESTRICT signs = x[i].signs;
11242
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
11243
  int32_t bsum = 0;
11244
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
11245
  const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
 
11291
  }
11292
  #endif
11293
 
11294
+ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
11295
  assert(n % QK_K == 0);
11296
  assert(nrc == 1);
11297
  UNUSED(nrc);
 
11299
  UNUSED(by);
11300
  UNUSED(bs);
11301
 
11302
+ const block_iq1_s * GGML_RESTRICT x = vx;
11303
+ const block_q8_K * GGML_RESTRICT y = vy;
11304
 
11305
  const int nb = n / QK_K;
11306
 
 
11458
  vector signed int vsumi3 = vec_splats((int32_t)0);
11459
  vector signed int vsumi8 = vec_splats((int32_t)0);
11460
 
11461
+ const uint8_t * GGML_RESTRICT q1 = x[i].qs;
11462
+ const uint16_t * GGML_RESTRICT qh = x[i].qh;
11463
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
11464
+ const int16_t * GGML_RESTRICT qs = y[i].bsums;
11465
 
11466
  for (int j = 0; j < QK_K/32; j += 2) {
11467
  __builtin_prefetch(q1, 0, 1);
 
11622
  #endif
11623
  }
11624
 
11625
+ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
11626
  assert(n % QK_K == 0);
11627
  assert(nrc == 1);
11628
  UNUSED(nrc);
 
11630
  UNUSED(by);
11631
  UNUSED(bs);
11632
 
11633
+ const block_iq1_m * GGML_RESTRICT x = vx;
11634
+ const block_q8_K * GGML_RESTRICT y = vy;
11635
 
11636
  const int nb = n / QK_K;
11637
 
 
11912
  #endif
11913
  }
11914
 
11915
+ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
11916
  assert(nrc == 1);
11917
  UNUSED(nrc);
11918
  UNUSED(bx);
 
11921
  assert(n % QK4_NL == 0);
11922
  static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
11923
 
11924
+ const block_iq4_nl * GGML_RESTRICT x = vx;
11925
+ const block_q8_0 * GGML_RESTRICT y = vy;
11926
 
11927
  const int nb = n / QK4_NL;
11928
 
 
12097
  const uint8x16_t v_m = vec_splat_u8(0x0F);
12098
 
12099
  for (; ib < nb; ++ib) {
12100
+ const block_iq4_nl * GGML_RESTRICT x0 = &x[ib];
12101
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
12102
 
12103
  const uint8x16_t v_x = vec_xl(0, x0->qs);
12104
  int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
 
12126
  *s = sumf;
12127
  }
12128
 
12129
+ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
12130
  assert(nrc == 1);
12131
  UNUSED(nrc);
12132
  UNUSED(bx);
 
12134
  UNUSED(bs);
12135
  assert(n % QK_K == 0);
12136
 
12137
+ const block_iq4_xs * GGML_RESTRICT x = vx;
12138
+ const block_q8_K * GGML_RESTRICT y = vy;
12139
 
12140
  const int nb = n / QK_K;
12141
 
 
12292
 
12293
  uint16_t h = x[ibl].scales_h;
12294
 
12295
+ const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
12296
+ const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l;
12297
+ const int8_t * GGML_RESTRICT q8 = y[ibl].qs;
12298
 
12299
  for (int ib = 0; ib < QK_K/64; ib ++ ) {
12300
  __builtin_prefetch(q4, 0, 1);
 
12398
  float sumf = 0;
12399
 
12400
  for (int ibl = 0; ibl < nb; ++ibl) {
12401
+ const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
12402
+ const int8_t * GGML_RESTRICT q8 = y[ibl].qs;
12403
 
12404
  uint16_t h = x[ibl].scales_h;
12405
 
 
12479
 
12480
  // ============================ 4-bit non-linear quants
12481
 
12482
+ void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
12483
  assert(k % QK4_NL == 0);
12484
  quantize_row_iq4_nl_ref(x, y, k);
12485
  }
12486
 
12487
+ void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
12488
  assert(k % QK_K == 0);
12489
  quantize_iq4_xs(x, y, 1, k, NULL);
12490
  }
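Aside (not part of this commit): a minimal standalone C sketch of the pattern the hunks above and below apply. A restrict-qualified pointer parameter promises the compiler that the pointed-to object is not accessed through any other pointer in the same scope, so accumulators can stay in registers and loops can be vectorized without redundant loads and stores. The RESTRICT macro below is a hypothetical, simplified stand-in used only so the snippet compiles on its own (MSVC's plain C mode accepts the __restrict spelling); the real ggml sources use the GGML_RESTRICT macro from ggml.h instead.

#include <stdio.h>

/* Simplified stand-in macro for illustration only. */
#if defined(_MSC_VER)
#  define RESTRICT __restrict
#else
#  define RESTRICT restrict
#endif

/* s, x and y are promised not to alias, so `sum` can live in a register
 * and the loop can be vectorized without re-reading through the pointers. */
static void vec_dot_f32(int n, float * RESTRICT s,
                        const float * RESTRICT x,
                        const float * RESTRICT y) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += x[i] * y[i];
    }
    *s = sum;
}

int main(void) {
    const float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    const float b[4] = {1.0f, 1.0f, 1.0f, 1.0f};
    float out;
    vec_dot_f32(4, &out, a, b);
    printf("%f\n", out); /* prints 10.000000 */
    return 0;
}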
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -247,9 +247,9 @@ typedef pthread_t ggml_thread_t;
247
  static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
248
 
249
 
250
- static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
251
- static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
252
- static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
253
 
254
  static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
255
  [GGML_TYPE_F32] = {
@@ -1451,7 +1451,7 @@ inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp
1451
  }
1452
  }
1453
 
1454
- static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
1455
  assert(nrc == 1);
1456
  UNUSED(nrc);
1457
  UNUSED(bx);
@@ -1494,7 +1494,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
1494
  *s = sumf;
1495
  }
1496
 
1497
- static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) {
1498
  assert(nrc == 1);
1499
  UNUSED(nrc);
1500
  UNUSED(bx);
@@ -1562,7 +1562,7 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
1562
  *s = sumf;
1563
  }
1564
 
1565
- static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
1566
  assert(nrc == 1);
1567
  UNUSED(nrc);
1568
  UNUSED(bx);
@@ -1606,10 +1606,10 @@ static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t *
1606
 
1607
  // compute GGML_VEC_DOT_UNROLL dot products at once
1608
  // xs - x row stride in bytes
1609
- inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
1610
  ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
1611
 
1612
- ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
1613
 
1614
  for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
1615
  x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
@@ -1659,7 +1659,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
1659
  }
1660
  }
1661
 
1662
- inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
1663
  #if defined(GGML_SIMD)
1664
  const int np = (n & ~(GGML_F32_STEP - 1));
1665
 
@@ -1690,7 +1690,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
1690
  #endif
1691
  }
1692
 
1693
- inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
1694
  #if defined(GGML_SIMD)
1695
  const int np = (n & ~(GGML_F16_STEP - 1));
1696
 
@@ -1722,10 +1722,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const
1722
  }
1723
 
1724
  // xs and vs are byte strides of x and v
1725
- inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
1726
 
1727
- const float * restrict x[GGML_VEC_MAD_UNROLL];
1728
- const float * restrict v[GGML_VEC_MAD_UNROLL];
1729
 
1730
  for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
1731
  x[i] = (const float *) ((const char *) xv + i*xs);
 
247
  static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
248
 
249
 
250
+ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
251
+ static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
252
+ static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
253
 
254
  static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
255
  [GGML_TYPE_F32] = {
 
1451
  }
1452
  }
1453
 
1454
+ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
1455
  assert(nrc == 1);
1456
  UNUSED(nrc);
1457
  UNUSED(bx);
 
1494
  *s = sumf;
1495
  }
1496
 
1497
+ static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
1498
  assert(nrc == 1);
1499
  UNUSED(nrc);
1500
  UNUSED(bx);
 
1562
  *s = sumf;
1563
  }
1564
 
1565
+ static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
1566
  assert(nrc == 1);
1567
  UNUSED(nrc);
1568
  UNUSED(bx);
 
1606
 
1607
  // compute GGML_VEC_DOT_UNROLL dot products at once
1608
  // xs - x row stride in bytes
1609
+ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
1610
  ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
1611
 
1612
+ ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
1613
 
1614
  for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
1615
  x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
 
1659
  }
1660
  }
1661
 
1662
+ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
1663
  #if defined(GGML_SIMD)
1664
  const int np = (n & ~(GGML_F32_STEP - 1));
1665
 
 
1690
  #endif
1691
  }
1692
 
1693
+ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
1694
  #if defined(GGML_SIMD)
1695
  const int np = (n & ~(GGML_F16_STEP - 1));
1696
 
 
1722
  }
1723
 
1724
  // xs and vs are byte strides of x and v
1725
+ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
1726
 
1727
+ const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
1728
+ const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
1729
 
1730
  for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
1731
  x[i] = (const float *) ((const char *) xv + i*xs);
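Aside (not part of this commit): the pointer arrays in ggml_vec_mad_f32_unroll above show that the qualifier can also be applied to array elements, so each row pointer carries its own no-aliasing promise with respect to the output. A hedged sketch under the same simplified RESTRICT stand-in as before (hypothetical names, not the ggml API):

#include <stddef.h>

#if defined(_MSC_VER)
#  define RESTRICT __restrict
#else
#  define RESTRICT restrict
#endif

enum { UNROLL = 2 };

/* y += v * row_i for UNROLL rows; xs is the row stride in bytes.
 * Each element of x[] is restrict-qualified, i.e. no row is assumed
 * to overlap the output y. */
static void vec_mad_unroll(int n, size_t xs, float * RESTRICT y,
                           const float * RESTRICT xv, float v) {
    const float * RESTRICT x[UNROLL];
    for (int i = 0; i < UNROLL; ++i) {
        x[i] = (const float *) ((const char *) xv + (size_t) i * xs);
    }
    for (int i = 0; i < UNROLL; ++i) {
        for (int j = 0; j < n; ++j) {
            y[j] += x[i][j] * v;
        }
    }
}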
ggml/src/ggml-quants.c CHANGED
@@ -28,7 +28,7 @@
28
  #define UNUSED GGML_UNUSED
29
 
30
  // reference implementation for deterministic creation of model files
31
- void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
32
  static const int qk = QK4_0;
33
 
34
  assert(k % qk == 0);
@@ -65,7 +65,7 @@ void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, in
65
  }
66
  }
67
 
68
- void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
69
  const int qk = QK4_1;
70
 
71
  assert(k % qk == 0);
@@ -102,7 +102,7 @@ void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, in
102
  }
103
  }
104
 
105
- void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
106
  static const int qk = QK5_0;
107
 
108
  assert(k % qk == 0);
@@ -146,7 +146,7 @@ void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, in
146
  }
147
  }
148
 
149
- void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
150
  const int qk = QK5_1;
151
 
152
  assert(k % qk == 0);
@@ -191,7 +191,7 @@ void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, in
191
  }
192
 
193
  // reference implementation for deterministic creation of model files
194
- void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
195
  assert(k % QK8_0 == 0);
196
  const int nb = k / QK8_0;
197
 
@@ -217,7 +217,7 @@ void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, in
217
  }
218
 
219
  // reference implementation for deterministic creation of model files
220
- void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
221
  assert(QK8_1 == 32);
222
  assert(k % QK8_1 == 0);
223
  const int nb = k / QK8_1;
@@ -252,7 +252,7 @@ void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, in
252
  }
253
  }
254
 
255
- void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
256
  static const int qk = QK4_0;
257
 
258
  assert(k % qk == 0);
@@ -272,7 +272,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int6
272
  }
273
  }
274
 
275
- void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
276
  static const int qk = QK4_1;
277
 
278
  assert(k % qk == 0);
@@ -293,7 +293,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int6
293
  }
294
  }
295
 
296
- void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
297
  static const int qk = QK5_0;
298
 
299
  assert(k % qk == 0);
@@ -319,7 +319,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int6
319
  }
320
  }
321
 
322
- void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
323
  static const int qk = QK5_1;
324
 
325
  assert(k % qk == 0);
@@ -346,7 +346,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int6
346
  }
347
  }
348
 
349
- void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
350
  static const int qk = QK8_0;
351
 
352
  assert(k % qk == 0);
@@ -376,8 +376,8 @@ static inline int nearest_int(float fval) {
376
  return (i & 0x007fffff) - 0x00400000;
377
  }
378
 
379
- static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
380
- const float * restrict qw) {
381
  float max = 0;
382
  float amax = 0;
383
  for (int i = 0; i < n; ++i) {
@@ -445,7 +445,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
445
  return scale;
446
  }
447
 
448
- static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
449
  float max = 0;
450
  float amax = 0;
451
  for (int i = 0; i < n; ++i) {
@@ -504,7 +504,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
504
  return 1/iscale;
505
  }
506
 
507
- static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
508
  int ntry, float alpha) {
509
  float min = x[0];
510
  float max = x[0];
@@ -547,8 +547,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
547
  return scale;
548
  }
549
 
550
- static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
551
- uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
552
  float rmin, float rdelta, int nstep, bool use_mad) {
553
  float min = x[0];
554
  float max = x[0];
@@ -628,7 +628,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
628
  return scale;
629
  }
630
 
631
- static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
632
  if (j < 4) {
633
  *d = q[j] & 63; *m = q[j + 4] & 63;
634
  } else {
@@ -639,7 +639,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
639
 
640
  //========================- 2-bit (de)-quantization
641
 
642
- void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) {
643
  assert(k % QK_K == 0);
644
  const int nb = k / QK_K;
645
 
@@ -709,7 +709,7 @@ void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, in
709
  }
710
  }
711
 
712
- void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
713
  assert(k % QK_K == 0);
714
  const int nb = k / QK_K;
715
 
@@ -741,8 +741,8 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6
741
  }
742
  }
743
 
744
- static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
745
- uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
746
  float rmin, float rdelta, int nstep, bool use_mad) {
747
  float min = x[0];
748
  float max = x[0];
@@ -824,7 +824,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
824
  return scale;
825
  }
826
 
827
- static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
828
  float max = 0;
829
  for (int i = 0; i < n; ++i) {
830
  max = MAX(max, x[i]);
@@ -897,7 +897,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
897
  return sumlx/suml2;
898
  }
899
 
900
- static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
901
  GGML_ASSERT(quant_weights);
902
  assert(k % QK_K == 0);
903
  const int nb = k / QK_K;
@@ -917,7 +917,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
917
  for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
918
  float sigma2 = sumx2/QK_K;
919
  for (int j = 0; j < QK_K/16; ++j) {
920
- const float * restrict qw = quant_weights + QK_K * i + 16*j;
921
  for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
922
  for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
923
  scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
@@ -959,7 +959,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
959
  }
960
  }
961
 
962
- size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
963
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
964
  if (!quant_weights) {
965
  quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -977,7 +977,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nr
977
 
978
  //========================= 3-bit (de)-quantization
979
 
980
- void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) {
981
  assert(k % QK_K == 0);
982
  const int nb = k / QK_K;
983
 
@@ -1053,7 +1053,7 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
1053
  }
1054
  }
1055
 
1056
- void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
1057
  assert(k % QK_K == 0);
1058
  const int nb = k / QK_K;
1059
 
@@ -1067,8 +1067,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
1067
 
1068
  const float d_all = GGML_FP16_TO_FP32(x[i].d);
1069
 
1070
- const uint8_t * restrict q = x[i].qs;
1071
- const uint8_t * restrict hm = x[i].hmask;
1072
  uint8_t m = 1;
1073
 
1074
  memcpy(aux, x[i].scales, 12);
@@ -1103,7 +1103,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
1103
  }
1104
  }
1105
 
1106
- static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
1107
  assert(n_per_row % QK_K == 0);
1108
  const int nb = n_per_row / QK_K;
1109
 
@@ -1187,7 +1187,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
1187
  }
1188
  }
1189
 
1190
- size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1191
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
1192
  if (!quant_weights) {
1193
  quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -1205,7 +1205,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nr
1205
 
1206
  // ====================== 4-bit (de)-quantization
1207
 
1208
- void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) {
1209
  assert(k % QK_K == 0);
1210
  const int nb = k / QK_K;
1211
 
@@ -1277,7 +1277,7 @@ void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, in
1277
  }
1278
  }
1279
 
1280
- void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
1281
  assert(k % QK_K == 0);
1282
  const int nb = k / QK_K;
1283
 
@@ -1301,7 +1301,7 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6
1301
  }
1302
  }
1303
 
1304
- static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
1305
  assert(n_per_row % QK_K == 0);
1306
  const int64_t nb = n_per_row / QK_K;
1307
 
@@ -1374,7 +1374,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
1374
  }
1375
  }
1376
 
1377
- size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1378
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
1379
  if (!quant_weights) {
1380
  quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -1392,7 +1392,7 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nr
1392
 
1393
  // ====================== 5-bit (de)-quantization
1394
 
1395
- void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) {
1396
  assert(k % QK_K == 0);
1397
  const int64_t nb = k / QK_K;
1398
 
@@ -1454,8 +1454,8 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in
1454
  }
1455
  }
1456
 
1457
- uint8_t * restrict qh = y[i].qh;
1458
- uint8_t * restrict ql = y[i].qs;
1459
  memset(qh, 0, QK_K/8);
1460
 
1461
  uint8_t m1 = 1, m2 = 2;
@@ -1479,7 +1479,7 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in
1479
  }
1480
  }
1481
 
1482
- void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
1483
  assert(k % QK_K == 0);
1484
  const int64_t nb = k / QK_K;
1485
 
@@ -1506,7 +1506,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6
1506
  }
1507
  }
1508
 
1509
- static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
1510
  assert(n_per_row % QK_K == 0);
1511
  const int64_t nb = n_per_row / QK_K;
1512
 
@@ -1573,8 +1573,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
1573
  }
1574
  }
1575
 
1576
- uint8_t * restrict qh = y[i].qh;
1577
- uint8_t * restrict ql = y[i].qs;
1578
  memset(qh, 0, QK_K/8);
1579
 
1580
  uint8_t m1 = 1, m2 = 2;
@@ -1599,7 +1599,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
1599
  }
1600
  }
1601
 
1602
- size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1603
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
1604
  if (!quant_weights) {
1605
  quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -1617,7 +1617,7 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nr
1617
 
1618
  // ====================== 6-bit (de)-quantization
1619
 
1620
- void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) {
1621
  assert(k % QK_K == 0);
1622
  const int64_t nb = k / QK_K;
1623
 
@@ -1667,8 +1667,8 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in
1667
  }
1668
  }
1669
 
1670
- uint8_t * restrict ql = y[i].ql;
1671
- uint8_t * restrict qh = y[i].qh;
1672
  for (int j = 0; j < QK_K; j += 128) {
1673
  for (int l = 0; l < 32; ++l) {
1674
  const uint8_t q1 = L[j + l + 0] & 0xF;
@@ -1687,16 +1687,16 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in
1687
  }
1688
  }
1689
 
1690
- void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
1691
  assert(k % QK_K == 0);
1692
  const int64_t nb = k / QK_K;
1693
 
1694
  for (int i = 0; i < nb; i++) {
1695
  const float d = GGML_FP16_TO_FP32(x[i].d);
1696
 
1697
- const uint8_t * restrict ql = x[i].ql;
1698
- const uint8_t * restrict qh = x[i].qh;
1699
- const int8_t * restrict sc = x[i].scales;
1700
 
1701
  for (int n = 0; n < QK_K; n += 128) {
1702
  for (int l = 0; l < 32; ++l) {
@@ -1718,7 +1718,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6
1718
  }
1719
  }
1720
 
1721
- static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
1722
  assert(n_per_row % QK_K == 0);
1723
  const int64_t nb = n_per_row / QK_K;
1724
 
@@ -1781,8 +1781,8 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
1781
  }
1782
  }
1783
 
1784
- uint8_t * restrict ql = y[i].ql;
1785
- uint8_t * restrict qh = y[i].qh;
1786
  for (int j = 0; j < QK_K; j += 128) {
1787
  for (int l = 0; l < 32; ++l) {
1788
  const uint8_t q1 = L[j + l + 0] & 0xF;
@@ -1802,7 +1802,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
1802
  }
1803
  }
1804
 
1805
- size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1806
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
1807
  if (!quant_weights) {
1808
  quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -1818,7 +1818,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nr
1818
  return nrow * row_size;
1819
  }
1820
 
1821
- static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
1822
  static_assert(QK4_0 == 32, "QK4_0 must be 32");
1823
 
1824
  if (!quant_weights) {
@@ -1846,7 +1846,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
1846
  }
1847
  }
1848
 
1849
- size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1850
  if (!quant_weights) {
1851
  quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
1852
  return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
@@ -1861,7 +1861,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nr
1861
  return nrow * row_size;
1862
  }
1863
 
1864
- static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
1865
  static_assert(QK4_1 == 32, "QK4_1 must be 32");
1866
 
1867
  if (!quant_weights) {
@@ -1891,7 +1891,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
1891
  }
1892
  }
1893
 
1894
- size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1895
  if (!quant_weights) {
1896
  quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
1897
  return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
@@ -1906,7 +1906,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nr
1906
  return nrow * row_size;
1907
  }
1908
 
1909
- static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
1910
  static_assert(QK5_0 == 32, "QK5_0 must be 32");
1911
 
1912
  if (!quant_weights) {
@@ -1945,7 +1945,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
1945
  }
1946
  }
1947
 
1948
- size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1949
  if (!quant_weights) {
1950
  quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
1951
  return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
@@ -1960,7 +1960,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nr
1960
  return nrow * row_size;
1961
  }
1962
 
1963
- static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
1964
  static_assert(QK5_1 == 32, "QK5_1 must be 32");
1965
 
1966
  if (!quant_weights) {
@@ -1998,7 +1998,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
1998
  }
1999
  }
2000
 
2001
- size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2002
  if (!quant_weights) {
2003
  quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
2004
  return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
@@ -2013,7 +2013,7 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr
2013
  return nrow * row_size;
2014
  }
2015
 
2016
- size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2017
  (void)quant_weights; // not used
2018
  const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
2019
  quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -2022,7 +2022,7 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
2022
 
2023
  // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
2024
 
2025
- void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) {
2026
  assert(k % QK_K == 0);
2027
  const int64_t nb = k / QK_K;
2028
 
@@ -2088,7 +2088,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y,
2088
  }
2089
  }
2090
 
2091
- void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) {
2092
  assert(k % QK_K == 0);
2093
  const int64_t nb = k / QK_K;
2094
 
@@ -2120,21 +2120,21 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y,
2120
  }
2121
  }
2122
 
2123
- size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2124
  (void)quant_weights; // not used
2125
  const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
2126
  quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row);
2127
  return nrow * row_size;
2128
  }
2129
 
2130
- size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2131
  (void)quant_weights; // not used
2132
  const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
2133
  quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row);
2134
  return nrow * row_size;
2135
  }
2136
 
2137
- void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) {
2138
  assert(k % QK_K == 0);
2139
  const int64_t nb = k / QK_K;
2140
 
@@ -2173,7 +2173,7 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in
2173
  }
2174
  }
2175
 
2176
- void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, int64_t k) {
2177
  assert(k % QK_K == 0);
2178
  const int64_t nb = k / QK_K;
2179
 
@@ -2194,7 +2194,7 @@ void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, in
2194
 
2195
  // ====================== "True" 2-bit (de)-quantization
2196
 
2197
- void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
2198
  assert(k % QK_K == 0);
2199
  const int64_t nb = k / QK_K;
2200
 
@@ -2222,7 +2222,7 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
2222
 
2223
  // ====================== 2.3125 bpw (de)-quantization
2224
 
2225
- void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
2226
  assert(k % QK_K == 0);
2227
  const int64_t nb = k / QK_K;
2228
 
@@ -2249,7 +2249,7 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
2249
 
2250
  // ====================== 2.5625 bpw (de)-quantization
2251
 
2252
- void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
2253
  assert(k % QK_K == 0);
2254
  const int64_t nb = k / QK_K;
2255
 
@@ -2281,7 +2281,7 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in
2281
 
2282
  // ====================== 3.0625 bpw (de)-quantization
2283
 
2284
- void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
2285
  assert(k % QK_K == 0);
2286
  const int64_t nb = k / QK_K;
2287
 
@@ -2313,7 +2313,7 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
2313
 
2314
  // ====================== 3.3125 bpw (de)-quantization
2315
 
2316
- void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
2317
  assert(k % QK_K == 0);
2318
  const int64_t nb = k / QK_K;
2319
 
@@ -2356,7 +2356,7 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
2356
 
2357
  // ====================== 1.5625 bpw (de)-quantization
2358
 
2359
- void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
2360
  assert(k % QK_K == 0);
2361
  const int64_t nb = k / QK_K;
2362
 
@@ -2381,7 +2381,7 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
2381
  }
2382
  }
2383
 
2384
- void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
2385
  assert(k % QK_K == 0);
2386
  const int64_t nb = k / QK_K;
2387
 
@@ -2433,7 +2433,7 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in
2433
 
2434
  static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
2435
 
2436
- void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
2437
  assert(k % QK4_NL == 0);
2438
  const int64_t nb = k / QK4_NL;
2439
 
@@ -2451,7 +2451,7 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
2451
  }
2452
  }
2453
 
2454
- void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
2455
  assert(k % QK_K == 0);
2456
  const int64_t nb = k / QK_K;
2457
 
@@ -2476,7 +2476,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
2476
 
2477
  //===================================== Q8_K ==============================================
2478
 
2479
- void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) {
2480
  assert(k % QK_K == 0);
2481
  const int64_t nb = k / QK_K;
2482
 
@@ -2515,7 +2515,7 @@ void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, in
2515
  }
2516
  }
2517
 
2518
- void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
2519
  assert(k % QK_K == 0);
2520
  const int64_t nb = k / QK_K;
2521
 
@@ -2927,8 +2927,8 @@ void iq2xs_free_impl(enum ggml_type type) {
2927
  }
2928
  }
2929
 
2930
- static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
2931
- const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
2932
  int num_neighbors = neighbours[0];
2933
  GGML_ASSERT(num_neighbors > 0);
2934
  float best_d2 = FLT_MAX;
@@ -2951,7 +2951,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
2951
  return grid_index;
2952
  }
2953
 
2954
- static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
2955
 
2956
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
2957
 
@@ -3124,7 +3124,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
3124
  }
3125
  }
3126
 
3127
- static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
3128
 
3129
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
3130
 
@@ -3304,7 +3304,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
3304
  }
3305
  }
3306
 
3307
- size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3308
  GGML_ASSERT(n_per_row%QK_K == 0);
3309
  int64_t nblock = n_per_row/QK_K;
3310
  char * qrow = (char *)dst;
@@ -3316,7 +3316,7 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t
3316
  return nrow * nblock * sizeof(block_iq2_xxs);
3317
  }
3318
 
3319
- size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3320
  GGML_ASSERT(n_per_row%QK_K == 0);
3321
  int64_t nblock = n_per_row/QK_K;
3322
  char * qrow = (char *)dst;
@@ -3521,8 +3521,8 @@ void iq3xs_free_impl(int grid_size) {
3521
  }
3522
  }
3523
 
3524
- static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
3525
- const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
3526
  int num_neighbors = neighbours[0];
3527
  GGML_ASSERT(num_neighbors > 0);
3528
  float best_d2 = FLT_MAX;
@@ -3545,8 +3545,8 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
3545
  return grid_index;
3546
  }
3547
 
3548
- static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
3549
- const float * restrict quant_weights) {
3550
 
3551
  const int gindex = iq3_data_index(grid_size);
3552
 
@@ -3758,7 +3758,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
3758
  }
3759
  }
3760
 
3761
- size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3762
  GGML_ASSERT(n_per_row%QK_K == 0);
3763
  int64_t nblock = n_per_row/QK_K;
3764
  char * qrow = (char *)dst;
@@ -3770,13 +3770,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t
3770
  return nrow * nblock * sizeof(block_iq3_xxs);
3771
  }
3772
 
3773
- void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
3774
  assert(k % QK_K == 0);
3775
  quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
3776
  }
3777
 
3778
- static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
3779
- const float * restrict quant_weights,
3780
  float * scales,
3781
  float * weight,
3782
  float * xval,
@@ -3958,7 +3958,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
3958
  }
3959
 
3960
  #define IQ3S_BLOCK_SIZE 32
3961
- size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3962
  GGML_ASSERT(n_per_row%QK_K == 0);
3963
  int64_t nblock = n_per_row/QK_K;
3964
  float scales[QK_K/IQ3S_BLOCK_SIZE];
@@ -3980,7 +3980,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n
3980
  return nrow * nblock * sizeof(block_iq3_s);
3981
  }
3982
 
3983
- void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
3984
  assert(k % QK_K == 0);
3985
  quantize_iq3_s(x, y, 1, k, NULL);
3986
  }
@@ -3988,8 +3988,8 @@ void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y,
3988
 
3989
  // =================================== 1.5 bpw ===================================================
3990
 
3991
- static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
3992
- const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
3993
  int num_neighbors = neighbours[0];
3994
  GGML_ASSERT(num_neighbors > 0);
3995
  float best_score = -FLT_MAX;
@@ -4048,8 +4048,8 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
4048
  return grid_index;
4049
  }
4050
 
4051
- static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
4052
- const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) {
4053
  int num_neighbors = neighbours[0];
4054
  GGML_ASSERT(num_neighbors > 0);
4055
  float best_score = FLT_MAX;
@@ -4113,7 +4113,7 @@ static int iq1_sort_helper(const void * left, const void * right) {
4113
 
4114
  #define IQ1S_BLOCK_SIZE 32
4115
  #define IQ1M_BLOCK_SIZE 16
4116
- static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
4117
  float * scales,
4118
  float * weight,
4119
  float * sumx,
@@ -4271,7 +4271,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
4271
  }
4272
  }
4273
 
4274
- size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4275
  GGML_ASSERT(n_per_row%QK_K == 0);
4276
  float scales[QK_K/IQ1S_BLOCK_SIZE];
4277
  float weight[IQ1S_BLOCK_SIZE];
@@ -4291,7 +4291,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t n
4291
  return nrow * nblock * sizeof(block_iq1_s);
4292
  }
4293
 
4294
- static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
4295
  float * scales,
4296
  float * weight,
4297
  float * pairs,
@@ -4539,7 +4539,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
4539
  }
4540
  }
4541
 
4542
- size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4543
  GGML_ASSERT(n_per_row%QK_K == 0);
4544
  float scales[QK_K/IQ1M_BLOCK_SIZE];
4545
  float weight[IQ1M_BLOCK_SIZE];
@@ -4570,7 +4570,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
4570
  return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
4571
  }
4572
 
4573
- static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
4574
  ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
4575
  float * scales, float * weight, uint8_t * L,
4576
  const int8_t * values,
@@ -4681,7 +4681,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
4681
  }
4682
  }
4683
 
4684
- size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4685
  GGML_ASSERT(n_per_row%QK4_NL == 0);
4686
  int64_t nblock = n_per_row/QK4_NL;
4687
  char * qrow = (char *)dst;
@@ -4703,8 +4703,8 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t
4703
  return nrow * nblock * sizeof(block_iq4_nl);
4704
  }
4705
 
4706
- //void quantize_row_iq4_nl_ref(const float * restrict x, void * restrict vy, int64_t k) {
4707
- void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
4708
  GGML_ASSERT(k%QK4_NL == 0);
4709
  int64_t nblock = k/QK4_NL;
4710
  uint8_t L[QK4_NL];
@@ -4719,7 +4719,7 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y
4719
  }
4720
  }
4721
 
4722
- size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4723
  GGML_ASSERT(n_per_row%QK_K == 0);
4724
  int64_t nblock = n_per_row/QK_K;
4725
  char * qrow = (char *)dst;
@@ -4739,14 +4739,14 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
4739
  return nrow * nblock * sizeof(block_iq4_xs);
4740
  }
4741
 
4742
- void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
4743
  assert(k % QK_K == 0);
4744
  quantize_iq4_xs(x, y, 1, k, NULL);
4745
  }
4746
 
4747
  // =============================== 2.5625 bpw
4748
 
4749
- static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
4750
 
4751
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
4752
 
@@ -4914,7 +4914,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
4914
  }
4915
  }
4916
 
4917
- size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4918
  GGML_ASSERT(n_per_row%QK_K == 0);
4919
  int64_t nblock = n_per_row/QK_K;
4920
  char * qrow = (char *)dst;
@@ -4926,7 +4926,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n
4926
  return nrow * nblock * sizeof(block_iq2_s);
4927
  }
4928
 
4929
- void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
4930
  assert(k % QK_K == 0);
4931
  quantize_iq2_s(x, y, 1, k, NULL);
4932
  }
 
28
  #define UNUSED GGML_UNUSED
29
 
30
  // reference implementation for deterministic creation of model files
31
+ void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k) {
32
  static const int qk = QK4_0;
33
 
34
  assert(k % qk == 0);
 
65
  }
66
  }
67
 
68
+ void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k) {
69
  const int qk = QK4_1;
70
 
71
  assert(k % qk == 0);
 
102
  }
103
  }
104
 
105
+ void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k) {
106
  static const int qk = QK5_0;
107
 
108
  assert(k % qk == 0);
 
146
  }
147
  }
148
 
149
+ void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k) {
150
  const int qk = QK5_1;
151
 
152
  assert(k % qk == 0);
 
191
  }
192
 
193
  // reference implementation for deterministic creation of model files
194
+ void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k) {
195
  assert(k % QK8_0 == 0);
196
  const int nb = k / QK8_0;
197
 
 
217
  }
218
 
219
  // reference implementation for deterministic creation of model files
220
+ void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k) {
221
  assert(QK8_1 == 32);
222
  assert(k % QK8_1 == 0);
223
  const int nb = k / QK8_1;
 
252
  }
253
  }
254
 
255
+ void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
256
  static const int qk = QK4_0;
257
 
258
  assert(k % qk == 0);
 
272
  }
273
  }
274
 
275
+ void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
276
  static const int qk = QK4_1;
277
 
278
  assert(k % qk == 0);
 
293
  }
294
  }
295
 
296
+ void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
297
  static const int qk = QK5_0;
298
 
299
  assert(k % qk == 0);
 
319
  }
320
  }
321
 
322
+ void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
323
  static const int qk = QK5_1;
324
 
325
  assert(k % qk == 0);
 
346
  }
347
  }
348
 
349
+ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
350
  static const int qk = QK8_0;
351
 
352
  assert(k % qk == 0);
 
376
  return (i & 0x007fffff) - 0x00400000;
377
  }
378
 
379
+ static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
380
+ const float * GGML_RESTRICT qw) {
381
  float max = 0;
382
  float amax = 0;
383
  for (int i = 0; i < n; ++i) {
 
445
  return scale;
446
  }
447
 
448
+ static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
449
  float max = 0;
450
  float amax = 0;
451
  for (int i = 0; i < n; ++i) {
 
504
  return 1/iscale;
505
  }
506
 
507
+ static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
508
  int ntry, float alpha) {
509
  float min = x[0];
510
  float max = x[0];
 
547
  return scale;
548
  }
549
 
550
+ static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
551
+ uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
552
  float rmin, float rdelta, int nstep, bool use_mad) {
553
  float min = x[0];
554
  float max = x[0];
 
628
  return scale;
629
  }
630
 
631
+ static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
632
  if (j < 4) {
633
  *d = q[j] & 63; *m = q[j + 4] & 63;
634
  } else {
 
639
 
640
  //========================- 2-bit (de)-quantization
641
 
642
+ void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k) {
643
  assert(k % QK_K == 0);
644
  const int nb = k / QK_K;
645
 
 
709
  }
710
  }
711
 
712
+ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
713
  assert(k % QK_K == 0);
714
  const int nb = k / QK_K;
715
 
 
741
  }
742
  }
743
 
744
+ static float make_qkx3_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
745
+ uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
746
  float rmin, float rdelta, int nstep, bool use_mad) {
747
  float min = x[0];
748
  float max = x[0];
 
824
  return scale;
825
  }
826
 
827
+ static float make_qp_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, const float * quant_weights) {
828
  float max = 0;
829
  for (int i = 0; i < n; ++i) {
830
  max = MAX(max, x[i]);
 
897
  return sumlx/suml2;
898
  }
899
 
900
+ static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k, const float * GGML_RESTRICT quant_weights) {
901
  GGML_ASSERT(quant_weights);
902
  assert(k % QK_K == 0);
903
  const int nb = k / QK_K;
 
917
  for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
918
  float sigma2 = sumx2/QK_K;
919
  for (int j = 0; j < QK_K/16; ++j) {
920
+ const float * GGML_RESTRICT qw = quant_weights + QK_K * i + 16*j;
921
  for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
922
  for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
923
  scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
 
959
  }
960
  }
961
 
962
+ size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
963
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
964
  if (!quant_weights) {
965
  quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
 
977
 
978
  //========================= 3-bit (de)-quantization
979
 
980
+ void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k) {
981
  assert(k % QK_K == 0);
982
  const int nb = k / QK_K;
983
 
 
1053
  }
1054
  }
1055
 
1056
+ void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
1057
  assert(k % QK_K == 0);
1058
  const int nb = k / QK_K;
1059
 
 
1067
 
1068
  const float d_all = GGML_FP16_TO_FP32(x[i].d);
1069
 
1070
+ const uint8_t * GGML_RESTRICT q = x[i].qs;
1071
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
1072
  uint8_t m = 1;
1073
 
1074
  memcpy(aux, x[i].scales, 12);
 
1103
  }
1104
  }
1105
 
1106
+ static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t n_per_row, const float * GGML_RESTRICT quant_weights) {
1107
  assert(n_per_row % QK_K == 0);
1108
  const int nb = n_per_row / QK_K;
1109
 
 
1187
  }
1188
  }
1189
 
1190
+ size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1191
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
1192
  if (!quant_weights) {
1193
  quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
 
1205
 
1206
  // ====================== 4-bit (de)-quantization
1207
 
1208
+ void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k) {
1209
  assert(k % QK_K == 0);
1210
  const int nb = k / QK_K;
1211
 
 
1277
  }
1278
  }
1279
 
1280
+ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
1281
  assert(k % QK_K == 0);
1282
  const int nb = k / QK_K;
1283
 
 
1301
  }
1302
  }
1303
 
1304
+ static void quantize_row_q4_K_impl(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1305
  assert(n_per_row % QK_K == 0);
1306
  const int64_t nb = n_per_row / QK_K;
1307
 
 
1374
  }
1375
  }
1376
 
1377
+ size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1378
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
1379
  if (!quant_weights) {
1380
  quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
 
1392
 
1393
  // ====================== 5-bit (de)-quantization
1394
 
1395
+ void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k) {
1396
  assert(k % QK_K == 0);
1397
  const int64_t nb = k / QK_K;
1398
 
 
1454
  }
1455
  }
1456
 
1457
+ uint8_t * GGML_RESTRICT qh = y[i].qh;
1458
+ uint8_t * GGML_RESTRICT ql = y[i].qs;
1459
  memset(qh, 0, QK_K/8);
1460
 
1461
  uint8_t m1 = 1, m2 = 2;
 
1479
  }
1480
  }
1481
 
1482
+ void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
1483
  assert(k % QK_K == 0);
1484
  const int64_t nb = k / QK_K;
1485
 
 
1506
  }
1507
  }
1508
 
1509
+ static void quantize_row_q5_K_impl(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1510
  assert(n_per_row % QK_K == 0);
1511
  const int64_t nb = n_per_row / QK_K;
1512
 
 
1573
  }
1574
  }
1575
 
1576
+ uint8_t * GGML_RESTRICT qh = y[i].qh;
1577
+ uint8_t * GGML_RESTRICT ql = y[i].qs;
1578
  memset(qh, 0, QK_K/8);
1579
 
1580
  uint8_t m1 = 1, m2 = 2;
 
1599
  }
1600
  }
1601
 
1602
+ size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1603
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
1604
  if (!quant_weights) {
1605
  quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
 
1617
 
1618
  // ====================== 6-bit (de)-quantization
1619
 
1620
+ void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k) {
1621
  assert(k % QK_K == 0);
1622
  const int64_t nb = k / QK_K;
1623
 
 
1667
  }
1668
  }
1669
 
1670
+ uint8_t * GGML_RESTRICT ql = y[i].ql;
1671
+ uint8_t * GGML_RESTRICT qh = y[i].qh;
1672
  for (int j = 0; j < QK_K; j += 128) {
1673
  for (int l = 0; l < 32; ++l) {
1674
  const uint8_t q1 = L[j + l + 0] & 0xF;
 
1687
  }
1688
  }
1689
 
1690
+ void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
1691
  assert(k % QK_K == 0);
1692
  const int64_t nb = k / QK_K;
1693
 
1694
  for (int i = 0; i < nb; i++) {
1695
  const float d = GGML_FP16_TO_FP32(x[i].d);
1696
 
1697
+ const uint8_t * GGML_RESTRICT ql = x[i].ql;
1698
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
1699
+ const int8_t * GGML_RESTRICT sc = x[i].scales;
1700
 
1701
  for (int n = 0; n < QK_K; n += 128) {
1702
  for (int l = 0; l < 32; ++l) {
 
1718
  }
1719
  }
1720
 
1721
+ static void quantize_row_q6_K_impl(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1722
  assert(n_per_row % QK_K == 0);
1723
  const int64_t nb = n_per_row / QK_K;
1724
 
 
1781
  }
1782
  }
1783
 
1784
+ uint8_t * GGML_RESTRICT ql = y[i].ql;
1785
+ uint8_t * GGML_RESTRICT qh = y[i].qh;
1786
  for (int j = 0; j < QK_K; j += 128) {
1787
  for (int l = 0; l < 32; ++l) {
1788
  const uint8_t q1 = L[j + l + 0] & 0xF;
 
1802
  }
1803
  }
1804
 
1805
+ size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1806
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
1807
  if (!quant_weights) {
1808
  quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
 
1818
  return nrow * row_size;
1819
  }
1820
 
1821
+ static void quantize_row_q4_0_impl(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1822
  static_assert(QK4_0 == 32, "QK4_0 must be 32");
1823
 
1824
  if (!quant_weights) {
 
1846
  }
1847
  }
1848
 
1849
+ size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1850
  if (!quant_weights) {
1851
  quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
1852
  return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
 
1861
  return nrow * row_size;
1862
  }
1863
 
1864
+ static void quantize_row_q4_1_impl(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1865
  static_assert(QK4_1 == 32, "QK4_1 must be 32");
1866
 
1867
  if (!quant_weights) {
 
1891
  }
1892
  }
1893
 
1894
+ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1895
  if (!quant_weights) {
1896
  quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
1897
  return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
 
1906
  return nrow * row_size;
1907
  }
1908
 
1909
+ static void quantize_row_q5_0_impl(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1910
  static_assert(QK5_0 == 32, "QK5_0 must be 32");
1911
 
1912
  if (!quant_weights) {
 
1945
  }
1946
  }
1947
 
1948
+ size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1949
  if (!quant_weights) {
1950
  quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
1951
  return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
 
1960
  return nrow * row_size;
1961
  }
1962
 
1963
+ static void quantize_row_q5_1_impl(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1964
  static_assert(QK5_1 == 32, "QK5_1 must be 32");
1965
 
1966
  if (!quant_weights) {
 
1998
  }
1999
  }
2000
 
2001
+ size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2002
  if (!quant_weights) {
2003
  quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
2004
  return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
 
2013
  return nrow * row_size;
2014
  }
2015
 
2016
+ size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2017
  (void)quant_weights; // not used
2018
  const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
2019
  quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
 
2022
 
2023
  // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
2024
 
2025
+ void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k) {
2026
  assert(k % QK_K == 0);
2027
  const int64_t nb = k / QK_K;
2028
 
 
2088
  }
2089
  }
2090
 
2091
+ void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k) {
2092
  assert(k % QK_K == 0);
2093
  const int64_t nb = k / QK_K;
2094
 
 
2120
  }
2121
  }
2122
 
2123
+ size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2124
  (void)quant_weights; // not used
2125
  const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
2126
  quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row);
2127
  return nrow * row_size;
2128
  }
2129
 
2130
+ size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2131
  (void)quant_weights; // not used
2132
  const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
2133
  quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row);
2134
  return nrow * row_size;
2135
  }
2136
 
2137
+ void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2138
  assert(k % QK_K == 0);
2139
  const int64_t nb = k / QK_K;
2140
 
 
2173
  }
2174
  }
2175
 
2176
+ void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2177
  assert(k % QK_K == 0);
2178
  const int64_t nb = k / QK_K;
2179
 
 
2194
 
2195
  // ====================== "True" 2-bit (de)-quantization
2196
 
2197
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2198
  assert(k % QK_K == 0);
2199
  const int64_t nb = k / QK_K;
2200
 
 
2222
 
2223
  // ====================== 2.3125 bpw (de)-quantization
2224
 
2225
+ void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2226
  assert(k % QK_K == 0);
2227
  const int64_t nb = k / QK_K;
2228
 
 
2249
 
2250
  // ====================== 2.5625 bpw (de)-quantization
2251
 
2252
+ void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2253
  assert(k % QK_K == 0);
2254
  const int64_t nb = k / QK_K;
2255
 
 
2281
 
2282
  // ====================== 3.0625 bpw (de)-quantization
2283
 
2284
+ void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2285
  assert(k % QK_K == 0);
2286
  const int64_t nb = k / QK_K;
2287
 
 
2313
 
2314
  // ====================== 3.3125 bpw (de)-quantization
2315
 
2316
+ void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2317
  assert(k % QK_K == 0);
2318
  const int64_t nb = k / QK_K;
2319
 
 
2356
 
2357
  // ====================== 1.5625 bpw (de)-quantization
2358
 
2359
+ void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2360
  assert(k % QK_K == 0);
2361
  const int64_t nb = k / QK_K;
2362
 
 
2381
  }
2382
  }
2383
 
2384
+ void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2385
  assert(k % QK_K == 0);
2386
  const int64_t nb = k / QK_K;
2387
 
 
2433
 
2434
  static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
2435
 
2436
+ void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2437
  assert(k % QK4_NL == 0);
2438
  const int64_t nb = k / QK4_NL;
2439
 
 
2451
  }
2452
  }
2453
 
2454
+ void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2455
  assert(k % QK_K == 0);
2456
  const int64_t nb = k / QK_K;
2457
 
 
2476
 
2477
  //===================================== Q8_K ==============================================
2478
 
2479
+ void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k) {
2480
  assert(k % QK_K == 0);
2481
  const int64_t nb = k / QK_K;
2482
 
 
2515
  }
2516
  }
2517
 
2518
+ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2519
  assert(k % QK_K == 0);
2520
  const int64_t nb = k / QK_K;
2521
 
 
2927
  }
2928
  }
2929
 
2930
+ static int iq2_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
2931
+ const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
2932
  int num_neighbors = neighbours[0];
2933
  GGML_ASSERT(num_neighbors > 0);
2934
  float best_d2 = FLT_MAX;
 
2951
  return grid_index;
2952
  }
2953
 
2954
+ static void quantize_row_iq2_xxs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
2955
 
2956
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
2957
 
 
3124
  }
3125
  }
3126
 
3127
+ static void quantize_row_iq2_xs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
3128
 
3129
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
3130
 
 
3304
  }
3305
  }
3306
 
3307
+ size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3308
  GGML_ASSERT(n_per_row%QK_K == 0);
3309
  int64_t nblock = n_per_row/QK_K;
3310
  char * qrow = (char *)dst;
 
3316
  return nrow * nblock * sizeof(block_iq2_xxs);
3317
  }
3318
 
3319
+ size_t quantize_iq2_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3320
  GGML_ASSERT(n_per_row%QK_K == 0);
3321
  int64_t nblock = n_per_row/QK_K;
3322
  char * qrow = (char *)dst;
 
3521
  }
3522
  }
3523
 
3524
+ static int iq3_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint32_t * GGML_RESTRICT grid,
3525
+ const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
3526
  int num_neighbors = neighbours[0];
3527
  GGML_ASSERT(num_neighbors > 0);
3528
  float best_d2 = FLT_MAX;
 
3545
  return grid_index;
3546
  }
3547
 
3548
+ static void quantize_row_iq3_xxs_impl(int grid_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n,
3549
+ const float * GGML_RESTRICT quant_weights) {
3550
 
3551
  const int gindex = iq3_data_index(grid_size);
3552
 
 
3758
  }
3759
  }
3760
 
3761
+ size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3762
  GGML_ASSERT(n_per_row%QK_K == 0);
3763
  int64_t nblock = n_per_row/QK_K;
3764
  char * qrow = (char *)dst;
 
3770
  return nrow * nblock * sizeof(block_iq3_xxs);
3771
  }
3772
 
3773
+ void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k) {
3774
  assert(k % QK_K == 0);
3775
  quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
3776
  }
3777
 
3778
+ static void quantize_row_iq3_s_impl(int block_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int n,
3779
+ const float * GGML_RESTRICT quant_weights,
3780
  float * scales,
3781
  float * weight,
3782
  float * xval,
 
3958
  }
3959
 
3960
  #define IQ3S_BLOCK_SIZE 32
3961
+ size_t quantize_iq3_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3962
  GGML_ASSERT(n_per_row%QK_K == 0);
3963
  int64_t nblock = n_per_row/QK_K;
3964
  float scales[QK_K/IQ3S_BLOCK_SIZE];
 
3980
  return nrow * nblock * sizeof(block_iq3_s);
3981
  }
3982
 
3983
+ void quantize_row_iq3_s_ref(const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k) {
3984
  assert(k % QK_K == 0);
3985
  quantize_iq3_s(x, y, 1, k, NULL);
3986
  }
 
3988
 
3989
  // =================================== 1.5 bpw ===================================================
3990
 
3991
+ static int iq1_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
3992
+ const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float * scale, int8_t * GGML_RESTRICT L, int ngrid) {
3993
  int num_neighbors = neighbours[0];
3994
  GGML_ASSERT(num_neighbors > 0);
3995
  float best_score = -FLT_MAX;
 
4048
  return grid_index;
4049
  }
4050
 
4051
+ static int iq1_find_best_neighbour2(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
4052
+ const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, const float * GGML_RESTRICT xg, int8_t * GGML_RESTRICT L, int ngrid) {
4053
  int num_neighbors = neighbours[0];
4054
  GGML_ASSERT(num_neighbors > 0);
4055
  float best_score = FLT_MAX;
 
4113
 
4114
  #define IQ1S_BLOCK_SIZE 32
4115
  #define IQ1M_BLOCK_SIZE 16
4116
+ static void quantize_row_iq1_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
4117
  float * scales,
4118
  float * weight,
4119
  float * sumx,
 
4271
  }
4272
  }
4273
 
4274
+ size_t quantize_iq1_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4275
  GGML_ASSERT(n_per_row%QK_K == 0);
4276
  float scales[QK_K/IQ1S_BLOCK_SIZE];
4277
  float weight[IQ1S_BLOCK_SIZE];
 
4291
  return nrow * nblock * sizeof(block_iq1_s);
4292
  }
4293
 
4294
+ static void quantize_row_iq1_m_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
4295
  float * scales,
4296
  float * weight,
4297
  float * pairs,
 
4539
  }
4540
  }
4541
 
4542
+ size_t quantize_iq1_m(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4543
  GGML_ASSERT(n_per_row%QK_K == 0);
4544
  float scales[QK_K/IQ1M_BLOCK_SIZE];
4545
  float weight[IQ1M_BLOCK_SIZE];
 
4570
  return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
4571
  }
4572
 
4573
+ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
4574
  ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
4575
  float * scales, float * weight, uint8_t * L,
4576
  const int8_t * values,
 
4681
  }
4682
  }
4683
 
4684
+ size_t quantize_iq4_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4685
  GGML_ASSERT(n_per_row%QK4_NL == 0);
4686
  int64_t nblock = n_per_row/QK4_NL;
4687
  char * qrow = (char *)dst;
 
4703
  return nrow * nblock * sizeof(block_iq4_nl);
4704
  }
4705
 
4706
+ //void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
4707
+ void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k) {
4708
  GGML_ASSERT(k%QK4_NL == 0);
4709
  int64_t nblock = k/QK4_NL;
4710
  uint8_t L[QK4_NL];
 
4719
  }
4720
  }
4721
 
4722
+ size_t quantize_iq4_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4723
  GGML_ASSERT(n_per_row%QK_K == 0);
4724
  int64_t nblock = n_per_row/QK_K;
4725
  char * qrow = (char *)dst;
 
4739
  return nrow * nblock * sizeof(block_iq4_xs);
4740
  }
4741
 
4742
+ void quantize_row_iq4_xs_ref(const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k) {
4743
  assert(k % QK_K == 0);
4744
  quantize_iq4_xs(x, y, 1, k, NULL);
4745
  }
4746
 
4747
  // =============================== 2.5625 bpw
4748
 
4749
+ static void quantize_row_iq2_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
4750
 
4751
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
4752
 
 
4914
  }
4915
  }
4916
 
4917
+ size_t quantize_iq2_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4918
  GGML_ASSERT(n_per_row%QK_K == 0);
4919
  int64_t nblock = n_per_row/QK_K;
4920
  char * qrow = (char *)dst;
 
4926
  return nrow * nblock * sizeof(block_iq2_s);
4927
  }
4928
 
4929
+ void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k) {
4930
  assert(k % QK_K == 0);
4931
  quantize_iq2_s(x, y, 1, k, NULL);
4932
  }
ggml/src/ggml.c CHANGED
@@ -565,9 +565,9 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
565
  #endif
566
 
567
  }
568
- static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
569
- static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
570
- static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
571
 
572
  static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
573
  [GGML_TYPE_I8] = {
 
565
  #endif
566
 
567
  }
568
+ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
569
+ static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
570
+ static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
571
 
572
  static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
573
  [GGML_TYPE_I8] = {