Spaces:
Running
Running
opencl : sync opencl compilation fix in ggml (#1111)
Browse files
- ggml-opencl.cpp +6 -6
ggml-opencl.cpp
CHANGED
|
@@ -653,13 +653,13 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
|
| 653 |
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
| 654 |
const int in = tid - step*im; // 0...15 or 0...7
|
| 655 |
|
| 656 |
-
#if K_QUANTS_PER_ITERATION == 1
|
| 657 |
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
|
| 658 |
const int is = 0;
|
| 659 |
-
#else
|
| 660 |
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
| 661 |
const int is = in / 4;
|
| 662 |
-
#endif
|
| 663 |
const int ql_offset = 64*im + l0;
|
| 664 |
const int qh_offset = 32*im + l0;
|
| 665 |
const int s_offset = 8*im + is;
|
|
@@ -676,7 +676,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
|
| 676 |
|
| 677 |
const float d = vload_half(0, &x[i].d);
|
| 678 |
|
| 679 |
-
#if K_QUANTS_PER_ITERATION == 1
|
| 680 |
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
|
| 681 |
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
|
| 682 |
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
|
|
@@ -686,7 +686,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
|
| 686 |
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
|
| 687 |
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
|
| 688 |
tmp[16 * ix + tid] += sum;
|
| 689 |
-
#else
|
| 690 |
float sum = 0;
|
| 691 |
for (int l = 0; l < 4; ++l) {
|
| 692 |
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
|
|
@@ -695,7 +695,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
|
| 695 |
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
|
| 696 |
}
|
| 697 |
tmp[16 * ix + tid] += sum;
|
| 698 |
-
#endif
|
| 699 |
|
| 700 |
}
|
| 701 |
|
|
|
|
| 653 |
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
| 654 |
const int in = tid - step*im; // 0...15 or 0...7
|
| 655 |
|
| 656 |
+
\n#if K_QUANTS_PER_ITERATION == 1\n
|
| 657 |
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
|
| 658 |
const int is = 0;
|
| 659 |
+
\n#else\n
|
| 660 |
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
| 661 |
const int is = in / 4;
|
| 662 |
+
\n#endif\n
|
| 663 |
const int ql_offset = 64*im + l0;
|
| 664 |
const int qh_offset = 32*im + l0;
|
| 665 |
const int s_offset = 8*im + is;
|
|
|
|
| 676 |
|
| 677 |
const float d = vload_half(0, &x[i].d);
|
| 678 |
|
| 679 |
+
\n#if K_QUANTS_PER_ITERATION == 1\n
|
| 680 |
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
|
| 681 |
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
|
| 682 |
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
|
|
|
|
| 686 |
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
|
| 687 |
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
|
| 688 |
tmp[16 * ix + tid] += sum;
|
| 689 |
+
\n#else\n
|
| 690 |
float sum = 0;
|
| 691 |
for (int l = 0; l < 4; ++l) {
|
| 692 |
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
|
|
|
|
| 695 |
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
|
| 696 |
}
|
| 697 |
tmp[16 * ix + tid] += sum;
|
| 698 |
+
\n#endif\n
|
| 699 |
|
| 700 |
}
|
| 701 |
|