slaren committed on
Commit
b57bcbc
·
1 Parent(s): a6d9f2d

ggml : fix quants nans when all the group weights are very close to zero (llama/7313)

Browse files
Files changed (1) hide show
  1. ggml-quants.c +18 -12
ggml-quants.c CHANGED
@@ -14,6 +14,12 @@
14
  #include <stdlib.h> // for qsort
15
  #include <stdio.h> // for GGML_ASSERT
16
 
 
 
 
 
 
 
17
  #if defined(_MSC_VER)
18
  // disable "possible loss of data" to avoid warnings for hundreds of casts
19
  // we should just be careful :)
@@ -1109,7 +1115,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
1109
  float ax = fabsf(x[i]);
1110
  if (ax > amax) { amax = ax; max = x[i]; }
1111
  }
1112
- if (amax < 1e-30f) { // all zero
1113
  for (int i = 0; i < n; ++i) {
1114
  L[i] = 0;
1115
  }
@@ -1177,7 +1183,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
1177
  float ax = fabsf(x[i]);
1178
  if (ax > amax) { amax = ax; max = x[i]; }
1179
  }
1180
- if (!amax) { // all zero
1181
  for (int i = 0; i < n; ++i) { L[i] = 0; }
1182
  return 0.f;
1183
  }
@@ -1646,7 +1652,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
1646
  break;
1647
  }
1648
  }
1649
- return sumlx / suml2;
1650
  }
1651
 
1652
  static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
@@ -2653,7 +2659,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
2653
 
2654
  }
2655
 
2656
- if (!max_abs_scale) {
2657
  memset(&y[i], 0, sizeof(block_q6_K));
2658
  y[i].d = GGML_FP32_TO_FP16(0.f);
2659
  x += QK_K;
@@ -2805,7 +2811,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
2805
 
2806
  }
2807
 
2808
- if (!max_abs_scale) {
2809
  memset(&y[i], 0, sizeof(block_q6_K));
2810
  y[i].d = GGML_FP32_TO_FP16(0.f);
2811
  x += QK_K;
@@ -12599,7 +12605,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
12599
  }
12600
  float max = xval[0];
12601
  for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
12602
- if (!max) {
12603
  scales[ib] = 0;
12604
  memset(L, 0, 32);
12605
  continue;
@@ -12775,7 +12781,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
12775
  }
12776
  float max = xval[0];
12777
  for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
12778
- if (!max) {
12779
  scales[ib] = 0;
12780
  memset(L, 0, 16);
12781
  continue;
@@ -13216,7 +13222,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
13216
  }
13217
  float max = xval[0];
13218
  for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
13219
- if (!max) {
13220
  scales[ib] = 0;
13221
  memset(L, 0, 32);
13222
  continue;
@@ -13756,7 +13762,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
13756
  for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
13757
  float max = fabsf(xb[0]);
13758
  for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
13759
- if (!max) {
13760
  scales[ib] = 0;
13761
  memset(L, 1, block_size);
13762
  continue;
@@ -13944,7 +13950,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
13944
  }
13945
  float max = fabsf(xb[0]);
13946
  for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
13947
- if (!max) {
13948
  scales[ib] = 0;
13949
  memset(L, 1, block_size);
13950
  continue;
@@ -14208,7 +14214,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
14208
  amax = ax; max = xb[j];
14209
  }
14210
  }
14211
- if (!amax) {
14212
  scales[ib] = 0;
14213
  continue;
14214
  }
@@ -14429,7 +14435,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
14429
  }
14430
  float max = xval[0];
14431
  for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
14432
- if (!max) {
14433
  scales[ib] = 0;
14434
  continue;
14435
  }
 
14
  #include <stdlib.h> // for qsort
15
  #include <stdio.h> // for GGML_ASSERT
16
 
17
+ #define GROUP_MAX_EPS 1e-15f
18
+ #define GROUP_MAX_EPS_IQ3_XXS 1e-8f
19
+ #define GROUP_MAX_EPS_IQ2_S 1e-8f
20
+ #define GROUP_MAX_EPS_IQ1_M 1e-7f
21
+ #define GROUP_MAX_EPS_IQ1_S 1e-12f
22
+
23
  #if defined(_MSC_VER)
24
  // disable "possible loss of data" to avoid warnings for hundreds of casts
25
  // we should just be careful :)
 
1115
  float ax = fabsf(x[i]);
1116
  if (ax > amax) { amax = ax; max = x[i]; }
1117
  }
1118
+ if (amax < GROUP_MAX_EPS) { // all zero
1119
  for (int i = 0; i < n; ++i) {
1120
  L[i] = 0;
1121
  }
 
1183
  float ax = fabsf(x[i]);
1184
  if (ax > amax) { amax = ax; max = x[i]; }
1185
  }
1186
+ if (amax < GROUP_MAX_EPS) { // all zero
1187
  for (int i = 0; i < n; ++i) { L[i] = 0; }
1188
  return 0.f;
1189
  }
 
1652
  break;
1653
  }
1654
  }
1655
+ return sumlx/suml2;
1656
  }
1657
 
1658
  static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
 
2659
 
2660
  }
2661
 
2662
+ if (max_abs_scale < GROUP_MAX_EPS) {
2663
  memset(&y[i], 0, sizeof(block_q6_K));
2664
  y[i].d = GGML_FP32_TO_FP16(0.f);
2665
  x += QK_K;
 
2811
 
2812
  }
2813
 
2814
+ if (max_abs_scale < GROUP_MAX_EPS) {
2815
  memset(&y[i], 0, sizeof(block_q6_K));
2816
  y[i].d = GGML_FP32_TO_FP16(0.f);
2817
  x += QK_K;
 
12605
  }
12606
  float max = xval[0];
12607
  for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
12608
+ if (max < GROUP_MAX_EPS) {
12609
  scales[ib] = 0;
12610
  memset(L, 0, 32);
12611
  continue;
 
12781
  }
12782
  float max = xval[0];
12783
  for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
12784
+ if (max < GROUP_MAX_EPS) {
12785
  scales[ib] = 0;
12786
  memset(L, 0, 16);
12787
  continue;
 
13222
  }
13223
  float max = xval[0];
13224
  for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
13225
+ if (max < GROUP_MAX_EPS_IQ3_XXS) {
13226
  scales[ib] = 0;
13227
  memset(L, 0, 32);
13228
  continue;
 
13762
  for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
13763
  float max = fabsf(xb[0]);
13764
  for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
13765
+ if (max < GROUP_MAX_EPS_IQ1_S) {
13766
  scales[ib] = 0;
13767
  memset(L, 1, block_size);
13768
  continue;
 
13950
  }
13951
  float max = fabsf(xb[0]);
13952
  for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
13953
+ if (max < GROUP_MAX_EPS_IQ1_M) {
13954
  scales[ib] = 0;
13955
  memset(L, 1, block_size);
13956
  continue;
 
14214
  amax = ax; max = xb[j];
14215
  }
14216
  }
14217
+ if (amax < GROUP_MAX_EPS) {
14218
  scales[ib] = 0;
14219
  continue;
14220
  }
 
14435
  }
14436
  float max = xval[0];
14437
  for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
14438
+ if (max < GROUP_MAX_EPS_IQ2_S) {
14439
  scales[ib] = 0;
14440
  continue;
14441
  }