Spaces:
Running
Running
slaren
committed on
Commit
·
b57bcbc
1
Parent(s):
a6d9f2d
ggml : fix quants nans when all the group weights are very close to zero (llama/7313)
Browse files - ggml-quants.c +18 -12
ggml-quants.c
CHANGED
|
@@ -14,6 +14,12 @@
|
|
| 14 |
#include <stdlib.h> // for qsort
|
| 15 |
#include <stdio.h> // for GGML_ASSERT
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
#if defined(_MSC_VER)
|
| 18 |
// disable "possible loss of data" to avoid warnings for hundreds of casts
|
| 19 |
// we should just be careful :)
|
|
@@ -1109,7 +1115,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
| 1109 |
float ax = fabsf(x[i]);
|
| 1110 |
if (ax > amax) { amax = ax; max = x[i]; }
|
| 1111 |
}
|
| 1112 |
-
if (amax <
|
| 1113 |
for (int i = 0; i < n; ++i) {
|
| 1114 |
L[i] = 0;
|
| 1115 |
}
|
|
@@ -1177,7 +1183,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
| 1177 |
float ax = fabsf(x[i]);
|
| 1178 |
if (ax > amax) { amax = ax; max = x[i]; }
|
| 1179 |
}
|
| 1180 |
-
if (
|
| 1181 |
for (int i = 0; i < n; ++i) { L[i] = 0; }
|
| 1182 |
return 0.f;
|
| 1183 |
}
|
|
@@ -1646,7 +1652,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
|
|
| 1646 |
break;
|
| 1647 |
}
|
| 1648 |
}
|
| 1649 |
-
return sumlx
|
| 1650 |
}
|
| 1651 |
|
| 1652 |
static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
|
|
@@ -2653,7 +2659,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
|
|
| 2653 |
|
| 2654 |
}
|
| 2655 |
|
| 2656 |
-
if (
|
| 2657 |
memset(&y[i], 0, sizeof(block_q6_K));
|
| 2658 |
y[i].d = GGML_FP32_TO_FP16(0.f);
|
| 2659 |
x += QK_K;
|
|
@@ -2805,7 +2811,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|
| 2805 |
|
| 2806 |
}
|
| 2807 |
|
| 2808 |
-
if (
|
| 2809 |
memset(&y[i], 0, sizeof(block_q6_K));
|
| 2810 |
y[i].d = GGML_FP32_TO_FP16(0.f);
|
| 2811 |
x += QK_K;
|
|
@@ -12599,7 +12605,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
| 12599 |
}
|
| 12600 |
float max = xval[0];
|
| 12601 |
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
| 12602 |
-
if (
|
| 12603 |
scales[ib] = 0;
|
| 12604 |
memset(L, 0, 32);
|
| 12605 |
continue;
|
|
@@ -12775,7 +12781,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
| 12775 |
}
|
| 12776 |
float max = xval[0];
|
| 12777 |
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
| 12778 |
-
if (
|
| 12779 |
scales[ib] = 0;
|
| 12780 |
memset(L, 0, 16);
|
| 12781 |
continue;
|
|
@@ -13216,7 +13222,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
|
| 13216 |
}
|
| 13217 |
float max = xval[0];
|
| 13218 |
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
| 13219 |
-
if (
|
| 13220 |
scales[ib] = 0;
|
| 13221 |
memset(L, 0, 32);
|
| 13222 |
continue;
|
|
@@ -13756,7 +13762,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
| 13756 |
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
| 13757 |
float max = fabsf(xb[0]);
|
| 13758 |
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
| 13759 |
-
if (
|
| 13760 |
scales[ib] = 0;
|
| 13761 |
memset(L, 1, block_size);
|
| 13762 |
continue;
|
|
@@ -13944,7 +13950,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
|
|
| 13944 |
}
|
| 13945 |
float max = fabsf(xb[0]);
|
| 13946 |
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
| 13947 |
-
if (
|
| 13948 |
scales[ib] = 0;
|
| 13949 |
memset(L, 1, block_size);
|
| 13950 |
continue;
|
|
@@ -14208,7 +14214,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
|
| 14208 |
amax = ax; max = xb[j];
|
| 14209 |
}
|
| 14210 |
}
|
| 14211 |
-
if (
|
| 14212 |
scales[ib] = 0;
|
| 14213 |
continue;
|
| 14214 |
}
|
|
@@ -14429,7 +14435,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
|
| 14429 |
}
|
| 14430 |
float max = xval[0];
|
| 14431 |
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
| 14432 |
-
if (
|
| 14433 |
scales[ib] = 0;
|
| 14434 |
continue;
|
| 14435 |
}
|
|
|
|
| 14 |
#include <stdlib.h> // for qsort
|
| 15 |
#include <stdio.h> // for GGML_ASSERT
|
| 16 |
|
| 17 |
+
#define GROUP_MAX_EPS 1e-15f
|
| 18 |
+
#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
|
| 19 |
+
#define GROUP_MAX_EPS_IQ2_S 1e-8f
|
| 20 |
+
#define GROUP_MAX_EPS_IQ1_M 1e-7f
|
| 21 |
+
#define GROUP_MAX_EPS_IQ1_S 1e-12f
|
| 22 |
+
|
| 23 |
#if defined(_MSC_VER)
|
| 24 |
// disable "possible loss of data" to avoid warnings for hundreds of casts
|
| 25 |
// we should just be careful :)
|
|
|
|
| 1115 |
float ax = fabsf(x[i]);
|
| 1116 |
if (ax > amax) { amax = ax; max = x[i]; }
|
| 1117 |
}
|
| 1118 |
+
if (amax < GROUP_MAX_EPS) { // all zero
|
| 1119 |
for (int i = 0; i < n; ++i) {
|
| 1120 |
L[i] = 0;
|
| 1121 |
}
|
|
|
|
| 1183 |
float ax = fabsf(x[i]);
|
| 1184 |
if (ax > amax) { amax = ax; max = x[i]; }
|
| 1185 |
}
|
| 1186 |
+
if (amax < GROUP_MAX_EPS) { // all zero
|
| 1187 |
for (int i = 0; i < n; ++i) { L[i] = 0; }
|
| 1188 |
return 0.f;
|
| 1189 |
}
|
|
|
|
| 1652 |
break;
|
| 1653 |
}
|
| 1654 |
}
|
| 1655 |
+
return sumlx/suml2;
|
| 1656 |
}
|
| 1657 |
|
| 1658 |
static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
|
|
|
|
| 2659 |
|
| 2660 |
}
|
| 2661 |
|
| 2662 |
+
if (max_abs_scale < GROUP_MAX_EPS) {
|
| 2663 |
memset(&y[i], 0, sizeof(block_q6_K));
|
| 2664 |
y[i].d = GGML_FP32_TO_FP16(0.f);
|
| 2665 |
x += QK_K;
|
|
|
|
| 2811 |
|
| 2812 |
}
|
| 2813 |
|
| 2814 |
+
if (max_abs_scale < GROUP_MAX_EPS) {
|
| 2815 |
memset(&y[i], 0, sizeof(block_q6_K));
|
| 2816 |
y[i].d = GGML_FP32_TO_FP16(0.f);
|
| 2817 |
x += QK_K;
|
|
|
|
| 12605 |
}
|
| 12606 |
float max = xval[0];
|
| 12607 |
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
| 12608 |
+
if (max < GROUP_MAX_EPS) {
|
| 12609 |
scales[ib] = 0;
|
| 12610 |
memset(L, 0, 32);
|
| 12611 |
continue;
|
|
|
|
| 12781 |
}
|
| 12782 |
float max = xval[0];
|
| 12783 |
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
| 12784 |
+
if (max < GROUP_MAX_EPS) {
|
| 12785 |
scales[ib] = 0;
|
| 12786 |
memset(L, 0, 16);
|
| 12787 |
continue;
|
|
|
|
| 13222 |
}
|
| 13223 |
float max = xval[0];
|
| 13224 |
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
| 13225 |
+
if (max < GROUP_MAX_EPS_IQ3_XXS) {
|
| 13226 |
scales[ib] = 0;
|
| 13227 |
memset(L, 0, 32);
|
| 13228 |
continue;
|
|
|
|
| 13762 |
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
| 13763 |
float max = fabsf(xb[0]);
|
| 13764 |
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
| 13765 |
+
if (max < GROUP_MAX_EPS_IQ1_S) {
|
| 13766 |
scales[ib] = 0;
|
| 13767 |
memset(L, 1, block_size);
|
| 13768 |
continue;
|
|
|
|
| 13950 |
}
|
| 13951 |
float max = fabsf(xb[0]);
|
| 13952 |
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
| 13953 |
+
if (max < GROUP_MAX_EPS_IQ1_M) {
|
| 13954 |
scales[ib] = 0;
|
| 13955 |
memset(L, 1, block_size);
|
| 13956 |
continue;
|
|
|
|
| 14214 |
amax = ax; max = xb[j];
|
| 14215 |
}
|
| 14216 |
}
|
| 14217 |
+
if (amax < GROUP_MAX_EPS) {
|
| 14218 |
scales[ib] = 0;
|
| 14219 |
continue;
|
| 14220 |
}
|
|
|
|
| 14435 |
}
|
| 14436 |
float max = xval[0];
|
| 14437 |
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
| 14438 |
+
if (max < GROUP_MAX_EPS_IQ2_S) {
|
| 14439 |
scales[ib] = 0;
|
| 14440 |
continue;
|
| 14441 |
}
|