Spaces:
Running
Running
ggml-quants : fix compiler warnings (shadow variable) (llama/5472)
Browse files- ggml-quants.c +18 -18
ggml-quants.c
CHANGED
|
@@ -3819,15 +3819,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
| 3819 |
/* Compute combined scale for the block */
|
| 3820 |
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
| 3821 |
|
| 3822 |
-
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
| 3823 |
|
| 3824 |
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
|
| 3825 |
const __m256i off = _mm256_set1_epi8( 8 );
|
| 3826 |
-
bx = _mm256_sub_epi8( bx, off );
|
| 3827 |
|
| 3828 |
-
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 3829 |
|
| 3830 |
-
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
| 3831 |
|
| 3832 |
/* Multiply q with scale and accumulate */
|
| 3833 |
acc = _mm256_fmadd_ps( d, q, acc );
|
|
@@ -4196,10 +4196,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4196 |
const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
|
| 4197 |
|
| 4198 |
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
|
| 4199 |
-
const __m256i bx = bytes_from_nibbles_32(x[i].qs);
|
| 4200 |
-
const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
|
| 4201 |
|
| 4202 |
-
const __m256 xy = mul_sum_us8_pairs_float(bx, by);
|
| 4203 |
|
| 4204 |
// Accumulate d0*d1*x*y
|
| 4205 |
#if defined(__AVX2__)
|
|
@@ -4418,14 +4418,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
| 4418 |
/* Compute combined scale for the block */
|
| 4419 |
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
| 4420 |
|
| 4421 |
-
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
| 4422 |
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
| 4423 |
bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
|
| 4424 |
-
bx = _mm256_or_si256(bx, bxhi);
|
| 4425 |
|
| 4426 |
-
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 4427 |
|
| 4428 |
-
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
| 4429 |
|
| 4430 |
/* Multiply q with scale and accumulate */
|
| 4431 |
acc = _mm256_fmadd_ps(d, q, acc);
|
|
@@ -4722,15 +4722,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4722 |
|
| 4723 |
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
| 4724 |
|
| 4725 |
-
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
| 4726 |
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
| 4727 |
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
| 4728 |
-
bx = _mm256_or_si256(bx, bxhi);
|
| 4729 |
|
| 4730 |
const __m256 dy = _mm256_set1_ps(y[i].d);
|
| 4731 |
-
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 4732 |
|
| 4733 |
-
const __m256 q = mul_sum_us8_pairs_float(bx, by);
|
| 4734 |
|
| 4735 |
acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
|
| 4736 |
}
|
|
@@ -4973,10 +4973,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
| 4973 |
for (int i = 0; i < nb; ++i) {
|
| 4974 |
// Compute combined scale for the block
|
| 4975 |
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
| 4976 |
-
__m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
|
| 4977 |
-
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 4978 |
|
| 4979 |
-
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
| 4980 |
|
| 4981 |
// Multiply q with scale and accumulate
|
| 4982 |
#if defined(__AVX2__)
|
|
|
|
| 3819 |
/* Compute combined scale for the block */
|
| 3820 |
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
| 3821 |
|
| 3822 |
+
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
| 3823 |
|
| 3824 |
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
|
| 3825 |
const __m256i off = _mm256_set1_epi8( 8 );
|
| 3826 |
+
qx = _mm256_sub_epi8( qx, off );
|
| 3827 |
|
| 3828 |
+
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 3829 |
|
| 3830 |
+
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
|
| 3831 |
|
| 3832 |
/* Multiply q with scale and accumulate */
|
| 3833 |
acc = _mm256_fmadd_ps( d, q, acc );
|
|
|
|
| 4196 |
const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
|
| 4197 |
|
| 4198 |
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
|
| 4199 |
+
const __m256i qx = bytes_from_nibbles_32(x[i].qs);
|
| 4200 |
+
const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[i].qs );
|
| 4201 |
|
| 4202 |
+
const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
|
| 4203 |
|
| 4204 |
// Accumulate d0*d1*x*y
|
| 4205 |
#if defined(__AVX2__)
|
|
|
|
| 4418 |
/* Compute combined scale for the block */
|
| 4419 |
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
| 4420 |
|
| 4421 |
+
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
| 4422 |
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
| 4423 |
bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
|
| 4424 |
+
qx = _mm256_or_si256(qx, bxhi);
|
| 4425 |
|
| 4426 |
+
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 4427 |
|
| 4428 |
+
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
|
| 4429 |
|
| 4430 |
/* Multiply q with scale and accumulate */
|
| 4431 |
acc = _mm256_fmadd_ps(d, q, acc);
|
|
|
|
| 4722 |
|
| 4723 |
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
| 4724 |
|
| 4725 |
+
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
| 4726 |
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
| 4727 |
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
| 4728 |
+
qx = _mm256_or_si256(qx, bxhi);
|
| 4729 |
|
| 4730 |
const __m256 dy = _mm256_set1_ps(y[i].d);
|
| 4731 |
+
const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 4732 |
|
| 4733 |
+
const __m256 q = mul_sum_us8_pairs_float(qx, qy);
|
| 4734 |
|
| 4735 |
acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
|
| 4736 |
}
|
|
|
|
| 4973 |
for (int i = 0; i < nb; ++i) {
|
| 4974 |
// Compute combined scale for the block
|
| 4975 |
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
| 4976 |
+
__m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs);
|
| 4977 |
+
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 4978 |
|
| 4979 |
+
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
|
| 4980 |
|
| 4981 |
// Multiply q with scale and accumulate
|
| 4982 |
#if defined(__AVX2__)
|