Spaces:
Running
Running
Engininja2
committed on
ggml-quants : fix avx2 iq1_s vec_dot when compiled with gcc (llama/5742)
Browse files - ggml-quants.c +9 -5
ggml-quants.c
CHANGED
|
@@ -10248,8 +10248,12 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
| 10248 |
|
| 10249 |
uint64_t aux64;
|
| 10250 |
|
| 10251 |
-
|
| 10252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10253 |
|
| 10254 |
__m256 accum = _mm256_setzero_ps();
|
| 10255 |
for (int i = 0; i < nb; ++i) {
|
|
@@ -10264,13 +10268,13 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
| 10264 |
memcpy(&aux64, sc, 8); sc += 8;
|
| 10265 |
const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
|
| 10266 |
const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
|
| 10267 |
-
v_gindex = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
|
| 10268 |
const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
|
| 10269 |
|
| 10270 |
for (int i32 = 0; i32 < 4; ++i32) {
|
| 10271 |
const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
| 10272 |
-
const __m256i q1b = _mm256_set_epi64x(iq1s_grid[
|
| 10273 |
-
iq1s_grid[
|
| 10274 |
const __m256i dot = mul_add_epi8(q1b, q8b);
|
| 10275 |
const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
|
| 10276 |
const __m256i p = _mm256_madd_epi16(s16, dot);
|
|
|
|
| 10248 |
|
| 10249 |
uint64_t aux64;
|
| 10250 |
|
| 10251 |
+
typedef union m256i_uint16 {
|
| 10252 |
+
__m256i reg;
|
| 10253 |
+
uint16_t s[16];
|
| 10254 |
+
} m256i_uint16_t;
|
| 10255 |
+
|
| 10256 |
+
m256i_uint16_t v_gindex;
|
| 10257 |
|
| 10258 |
__m256 accum = _mm256_setzero_ps();
|
| 10259 |
for (int i = 0; i < nb; ++i) {
|
|
|
|
| 10268 |
memcpy(&aux64, sc, 8); sc += 8;
|
| 10269 |
const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
|
| 10270 |
const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
|
| 10271 |
+
v_gindex.reg = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
|
| 10272 |
const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
|
| 10273 |
|
| 10274 |
for (int i32 = 0; i32 < 4; ++i32) {
|
| 10275 |
const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
| 10276 |
+
const __m256i q1b = _mm256_set_epi64x(iq1s_grid[v_gindex.s[4*i32+3]], iq1s_grid[v_gindex.s[4*i32+2]],
|
| 10277 |
+
iq1s_grid[v_gindex.s[4*i32+1]], iq1s_grid[v_gindex.s[4*i32+0]]);
|
| 10278 |
const __m256i dot = mul_add_epi8(q1b, q8b);
|
| 10279 |
const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
|
| 10280 |
const __m256i p = _mm256_madd_epi16(s16, dot);
|