opencl : remove obsolete files (skip) (ggml/1200)
- ggml/src/ggml-opencl/kernels/ggml-opencl.cl +0 -0
- ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl +0 -106
- ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl +0 -268
- ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl +0 -274
- ggml/src/ggml-opencl/kernels/ggml-opencl_im2col.cl +0 -146
- ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl +0 -1225
- ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl +0 -139
- ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl +0 -26
- ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl +0 -25
- ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl +0 -35
ggml/src/ggml-opencl/kernels/ggml-opencl.cl
DELETED

The diff for this file is too large to render. See raw diff.
ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
DELETED
@@ -1,106 +0,0 @@
-//------------------------------------------------------------------------------
-// This file is contains additional kernels for data conversion.
-// These kernels are used when loading the model, so its performance is less
-// important.
-//------------------------------------------------------------------------------
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#elif defined(cl_amd_fp16)
-#pragma OPENCL EXTENSION cl_amd_fp16 : enable
-#else
-#error "Half precision floating point not supportedby OpenCL implementation on your device."
-#endif
-
-#ifdef cl_khr_subgroups
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#elif defined(cl_intel_subgroups)
-#pragma OPENCL EXTENSION cl_intel_subgroups : enable
-#else
-#error "Subgroup not supported on your device."
-#endif
-
-#ifdef cl_intel_required_subgroup_size
-// Always use subgroup size of 32 on Intel.
-#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
-#define INTEL_GPU 1
-#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
-#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
-#elif defined(cl_qcom_reqd_sub_group_size)
-// Always use subgroups size of 64 on Adreno.
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#else
-// TODO: do not know how to choose subgroup size on other GPUs.
-#error "Selecting subgroup size is not supported on your device."
-#endif
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-#define QR4_1 2
-#define QK5_0 32
-#define QR5_0 2
-#define QK5_1 32
-#define QR5_1 2
-#define QK8_0 32
-#define QR8_0 1
-#define QK_K 256
-#define K_QUANTS_PER_ITERATION 2
-
-typedef char int8_t;
-typedef uchar uint8_t;
-typedef short int16_t;
-typedef ushort uint16_t;
-typedef int int32_t;
-typedef uint uint32_t;
-
-//------------------------------------------------------------------------------
-// block_q4_0
-//------------------------------------------------------------------------------
-struct block_q4_0
-{
-    half d;
-    uint8_t qs[QK4_0 / 2];
-};
-
-//------------------------------------------------------------------------------
-// mul_vec_q_n_f32_flat_noshuffle
-//
-// This variation uses flat arrays (struct of arrays, SOA) representation for
-// quant tensors. It also uses non shuffled bit order for weights.
-//
-// The shuffled version is kept in the original file because moving it here
-// seems to result in worse performance for adreno.
-//------------------------------------------------------------------------------
-
-kernel void kernel_convert_block_q4_0_noshuffle(
-    global struct block_q4_0 * src0,
-    global uchar * dst_q,
-    global half * dst_d
-) {
-    global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
-    global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
-    global half * d = (global half *) dst_d + get_global_id(0);
-
-    *d = b->d;
-    for (int i = 0; i < QK4_0/4; ++i) {
-        uchar x0 = b->qs[2*i + 0];
-        uchar x1 = b->qs[2*i + 1];
-
-        q[i + 0      ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
-        q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
-
-#ifdef ADRENO_GPU
-        // Workaround for adreno - must have the following printf statement for
-        // the kernel to work properly. Otherwise it produces incorrect result.
-        // convert_uchar above also seems necessary.
-        // Compare against a large number so that it does not print anything.
-        // get_sub_group_local_id() also works.
-        if (get_global_id(0) == 65536*4096) {
-            printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
-        }
-#endif
-    }
-}
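The repacking in kernel_convert_block_q4_0_noshuffle is easier to see on the host. Below is a minimal C sketch of what it does to one block: in ggml's q4_0 layout, byte j of qs holds weight j in its low nibble and weight j+16 in its high nibble, and the kernel regroups the nibbles so that the first 8 output bytes carry weights 0..15 in order and the last 8 carry weights 16..31. The function name and test data here are made up for illustration.

#include <stdint.h>
#include <stdio.h>

#define QK4_0 32

/* Host-side model of kernel_convert_block_q4_0_noshuffle for one block.
 * Shuffled input: qs[j] packs weight j (low nibble) and weight j+16
 * (high nibble). Noshuffle output: bytes 0..7 hold weights 0..15 in
 * order, bytes 8..15 hold weights 16..31, two weights per byte. */
static void convert_q4_0_noshuffle(const uint8_t qs[QK4_0 / 2],
                                   uint8_t       out[QK4_0 / 2]) {
    for (int i = 0; i < QK4_0 / 4; ++i) {
        uint8_t x0 = qs[2 * i + 0];
        uint8_t x1 = qs[2 * i + 1];
        out[i + 0]         = (uint8_t)((x0 & 0x0F) | ((x1 & 0x0F) << 4));
        out[i + QK4_0 / 4] = (uint8_t)(((x0 & 0xF0) >> 4) | (x1 & 0xF0));
    }
}

int main(void) {
    /* fill a block so weight w carries quant value w % 16, then dump the
     * converted layout to confirm weights come out consecutively */
    uint8_t qs[QK4_0 / 2], out[QK4_0 / 2];
    for (int j = 0; j < QK4_0 / 2; ++j) {
        qs[j] = (uint8_t)((j % 16) | (((j + 16) % 16) << 4));
    }
    convert_q4_0_noshuffle(qs, out);
    for (int i = 0; i < QK4_0 / 2; ++i) {
        printf("%02x ", out[i]);
    }
    printf("\n");
    return 0;
}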
ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
DELETED
@@ -1,268 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-
-#ifdef cl_qcom_reqd_sub_group_size
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
-#endif
-
-// assume
-#define QK4_0 32
-#define N_SIMDGROUP 4
-
-#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
-    float shared_y; \
-    shared_y = sub_group_broadcast(y.s0, 0); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 0); \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 0); \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 0); \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 0); \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 0); \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 0); \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 0); \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s0, 1); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 1); \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 1); \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 1); \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 1); \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 1); \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 1); \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 1); \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-
-
-#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
-    shared_y = sub_group_broadcast(y.s0, 2); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 2); \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 2); \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 2); \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 2); \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 2); \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 2); \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 2); \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s0, 3); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 3); \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 3); \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 3); \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 3); \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 3); \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 3); \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 3); \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-
-
-#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
-    float8 shared_y; \
-    shared_y = sub_group_broadcast(y, 0); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-    shared_y = sub_group_broadcast(y, 1); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-
-
-#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
-    shared_y = sub_group_broadcast(y, 2); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-    shared_y = sub_group_broadcast(y, 3); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-
-#ifdef ADRENO_GPU
-REQD_SUBGROUP_SIZE_64
-#endif
-__kernel void kernel_gemv_noshuffle(
-        __read_only image1d_buffer_t src0_q,  // quantized A
-        global half2 * src0_d,                // A scales
-        __read_only image1d_buffer_t src1,    // B
-        ulong offset1,                        // offset to B (0)
-        global float * dst,                   // C
-        ulong offsetd,                        // offset to C (0)
-        uint K,                               // K
-        int ne01,                             // M
-        int ne02,                             // 1
-        int ne10,                             // K
-        int ne12,                             // 1
-        int ne0,                              // M
-        int ne1,                              // N
-        int r2,                               // 1
-        int r3)
-{
-    uint groupId = get_local_id(1);
-    uint gid     = get_global_id(0);
-    ushort slid  = get_sub_group_local_id();
-
-    __private uint4  regA;
-    __private half2  regS;
-    __private float8 regB;
-
-    __private float2 totalSum = (float2)(0.0f);
-
-    // loop along K in block granularity, skip 4 blocks every iter
-    for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
-        regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
-        // first 4 fibers in each wave load 8 B values to its private scope
-        if (slid < 4) {
-            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
-            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
-        }
-
-        // load half weights for two blocks in consecutive rows
-        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
-        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
-        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
-        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
-#ifdef VECTOR_SUB_GROUP_BROADCAT
-        dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
-#else
-        dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
-#endif // VECTOR_SUB_GROUP_BROADCAT
-
-        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
-        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
-        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
-        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
-#ifdef VECTOR_SUB_GROUP_BROADCAT
-        dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
-#else
-        dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
-#endif // VECTOR_SUB_GROUP_BROADCAT
-    }
-
-    // reduction in local memory, assumes #wave=4
-    __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
-    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
-    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
-    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
-    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
-    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
-
-    // 2 outputs per fiber in wave 0
-    if (groupId == 0) {
-        dst = (global float*)((global char*)dst + offsetd);
-        vstore2(totalSum, 0, &(dst[gid * 2]));
-    }
-
-}
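Each dequantizeBlockAccum step above applies the q4_0 decode q -> (q - 8) * scale to the four nibbles of one 16-bit lane, multiplying by an activation value that a single fiber loaded and then shared across the wave with sub_group_broadcast. A scalar C model of one lane, with the broadcast replaced by a plain array (names and values here are illustrative, not part of the deleted kernel):

#include <stdint.h>
#include <stdio.h>

/* One 16-bit lane holds four 4-bit quants; quant q contributes
 * (q - 8) * scale * y to the running dot product, exactly as in the
 * dequantizeBlockAccum_ns_sgbroadcast macros above. */
static float accum_lane(uint16_t bits4, float scale, const float y[4]) {
    float sum = 0.0f;
    sum += (float)(( bits4 & 0x000F)        - 8) * scale * y[0];
    sum += (float)(((bits4 & 0x00F0) >> 4)  - 8) * scale * y[1];
    sum += (float)(((bits4 & 0x0F00) >> 8)  - 8) * scale * y[2];
    sum += (float)(((bits4 & 0xF000) >> 12) - 8) * scale * y[3];
    return sum;
}

int main(void) {
    const float y[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    /* quants 9, 8, 7, 0 -> centered values +1, 0, -1, -8 */
    uint16_t bits4 = (uint16_t)(9 | (8 << 4) | (7 << 8) | (0 << 12));
    printf("%f\n", accum_lane(bits4, 0.5f, y)); /* 0.5*(1 - 3 - 32) = -17 */
    return 0;
}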
ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl
DELETED
@@ -1,274 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-
-#ifdef cl_qcom_reqd_sub_group_size
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
-#endif
-
-// assume
-#define QK4_0 32
-#define N_SIMDGROUP 4
-
-#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
-    float shared_y; \
-    shared_y = sub_group_broadcast(y.s0, 0); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 0); \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 0); \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 0); \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 0); \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 0); \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 0); \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 0); \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s0, 1); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 1); \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 1); \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 1); \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 1); \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 1); \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 1); \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 1); \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-
-
-#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
-    shared_y = sub_group_broadcast(y.s0, 2); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 2); \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 2); \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 2); \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 2); \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 2); \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 2); \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 2); \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s0, 3); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 3); \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 3); \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 3); \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 3); \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 3); \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 3); \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 3); \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-
-
-#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
-    float8 shared_y; \
-    shared_y = sub_group_broadcast(y, 0); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-    shared_y = sub_group_broadcast(y, 1); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-
-
-#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
-    shared_y = sub_group_broadcast(y, 2); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-    shared_y = sub_group_broadcast(y, 3); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-
-#ifdef ADRENO_GPU
-REQD_SUBGROUP_SIZE_64
-#endif
-__kernel void kernel_gemv_noshuffle(
-        __read_only image1d_buffer_t src0_q,  // quantized A
-        global half2 * src0_d,                // A scales
-        __read_only image1d_buffer_t src1,    // B
-        ulong offset1,                        // offset to B (0)
-        global float * dst,                   // C
-        ulong offsetd,                        // offset to C (0)
-        int ne00,                             // K
-        int ne01,                             // M
-        int ne02,                             // 1
-        int ne10,                             // K
-        int ne12,                             // 1
-        int ne0,                              // M
-        int ne1,                              // N
-        int r2,                               // 1
-        int r3)
-{
-    uint groupId = get_local_id(1);
-    uint gid     = get_global_id(0);
-    ushort slid  = get_sub_group_local_id();
-
-    uint K = ne00;
-    uint M = ne01;
-
-    uint LINE_STRIDE_A  = M / 2;
-    uint BLOCK_STRIDE_A = N_SIMDGROUP * M;
-
-    __private uint4  regA;
-    __private half2  regS;
-    __private float8 regB;
-
-    __private float2 totalSum = (float2)(0.0f);
-
-    // loop along K in block granularity, skip 4 blocks every iter
-    for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
-        regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
-        // first 4 fibers in each wave load 8 B values to its private scope
-        if (slid < 4) {
-            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
-            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
-        }
-
-        // load half weights for two blocks in consecutive rows
-        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
-        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
-        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
-        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
-#ifdef VECTOR_SUB_GROUP_BROADCAT
-        dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
-#else
-        dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
-#endif // VECTOR_SUB_GROUP_BROADCAT
-
-        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
-        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
-        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
-        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
-#ifdef VECTOR_SUB_GROUP_BROADCAT
-        dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
-#else
-        dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
-#endif // VECTOR_SUB_GROUP_BROADCAT
-    }
-
-    // reduction in local memory, assumes #wave=4
-    __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
-    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
-    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
-    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
-    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
-    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
-
-    // 2 outputs per fiber in wave 0
-    if (groupId == 0) {
-        dst = (global float*)((global char*)dst + offsetd);
-        vstore2(totalSum, 0, &(dst[gid * 2]));
-    }
-
-}
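This _general variant is, line for line, the same kernel as ggml-opencl_gemv_noshuffle.cl; the only functional difference is that it derives LINE_STRIDE_A and BLOCK_STRIDE_A from the runtime sizes ne00/ne01, whereas the non-general file uses them (and both files use SIMDGROUP_WIDTH) without defining them, presumably supplying them as compile-time options from the host build — that last point is an assumption, not something this diff shows. Both variants end with the same cross-wave reduction: waves 1..3 park their partial sums in local memory and wave 0 folds them in after a barrier. A host-side C model of that reduction (array sizes below are illustrative):

#include <stdio.h>

#define N_SIMDGROUP     4   /* waves per work-group, as in the kernel */
#define SIMDGROUP_WIDTH 64  /* fibers per wave; assumed host-supplied */

/* Model of the end of kernel_gemv_noshuffle: waves 1..3 store partial
 * sums to local memory, a barrier synchronizes, then wave 0 accumulates
 * the three stripes into its own partial sums and owns the result. */
static void reduce_waves(float partial[N_SIMDGROUP][SIMDGROUP_WIDTH]) {
    float reduceLM[(N_SIMDGROUP - 1) * SIMDGROUP_WIDTH];
    for (int g = 1; g < N_SIMDGROUP; ++g)            /* waves 1..3 store */
        for (int s = 0; s < SIMDGROUP_WIDTH; ++s)
            reduceLM[(g - 1) * SIMDGROUP_WIDTH + s] = partial[g][s];
    /* barrier(CLK_LOCAL_MEM_FENCE) happens here on the device */
    for (int g = 1; g < N_SIMDGROUP; ++g)            /* wave 0 folds in  */
        for (int s = 0; s < SIMDGROUP_WIDTH; ++s)
            partial[0][s] += reduceLM[(g - 1) * SIMDGROUP_WIDTH + s];
}

int main(void) {
    float partial[N_SIMDGROUP][SIMDGROUP_WIDTH];
    for (int g = 0; g < N_SIMDGROUP; ++g)
        for (int s = 0; s < SIMDGROUP_WIDTH; ++s)
            partial[g][s] = (float)(g + 1);          /* 1+2+3+4 = 10     */
    reduce_waves(partial);
    printf("%f\n", partial[0][0]);                   /* prints 10.000000 */
    return 0;
}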
ggml/src/ggml-opencl/kernels/ggml-opencl_im2col.cl
DELETED
@@ -1,146 +0,0 @@
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#elif defined(cl_amd_fp16)
-#pragma OPENCL EXTENSION cl_amd_fp16 : enable
-#else
-#error "Half precision floating point not supportedby OpenCL implementation on your device."
-#endif
-
-#ifdef cl_khr_subgroups
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#elif defined(cl_intel_subgroups)
-#pragma OPENCL EXTENSION cl_intel_subgroups : enable
-#else
-#error "Subgroup not supported on your device."
-#endif
-
-#ifdef cl_intel_required_subgroup_size
-// Always use subgroup size of 32 on Intel.
-#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
-#define INTEL_GPU 1
-#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
-#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
-#elif defined(cl_qcom_reqd_sub_group_size)
-// Always use subgroups size of 64 on Adreno.
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#else
-// TODO: do not know how to choose subgroup size on other GPUs.
-#error "Selecting subgroup size is not supported on your device."
-#endif
-
-kernel void kernel_im2col_f32(
-        global float * src1,
-        ulong offset1,
-        global float * dst,
-        ulong offsetd,
-        ulong batch_offset,
-        ulong delta_offset,
-        long IW,
-        long IH,
-        long IC,
-        long OW,
-        long OH,
-        long KW,
-        long KH,
-        long pelements,
-        long CHW,
-        int s0,
-        int s1,
-        int p0,
-        int p1,
-        int d0,
-        int d1
-) {
-    // threadIdx.x + blockIdx.x * blockDim.x
-    long i = get_global_id(0);
-    if (i >= pelements) {
-        return;
-    }
-
-    src1 = (global float*)((global char*)src1 + offset1);
-    dst = (global float*)((global char*)dst + offsetd);
-
-    long ksize = OW * (KH > 1 ? KW : 1);
-    long kx = i / ksize;
-    long kd = kx * ksize;
-    long ky = (i - kd) / OW;
-    long ix = i % OW;
-
-    long oh = get_group_id(1);
-    long batch = get_group_id(2) / IC;
-    long ic = get_group_id(2) % IC;
-
-    long iiw = ix * s0 + kx * d0 - p0;
-    long iih = oh * s1 + ky * d1 - p1;
-
-    long offset_dst =
-        ((batch * OH + oh) * OW + ix) * CHW +
-        (ic * (KW * KH) + ky * KW + kx);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] = 0.0f;
-    } else {
-        long offset_src = ic * delta_offset + batch * batch_offset;
-        dst[offset_dst] = src1[offset_src + iih * IW + iiw];
-    }
-}
-
-kernel void kernel_im2col_f16(
-        global float * src1,
-        ulong offset1,
-        global half * dst,
-        ulong offsetd,
-        ulong batch_offset,
-        ulong delta_offset,
-        long IW,
-        long IH,
-        long IC,
-        long OW,
-        long OH,
-        long KW,
-        long KH,
-        long pelements,
-        long CHW,
-        int s0,
-        int s1,
-        int p0,
-        int p1,
-        int d0,
-        int d1
-) {
-    long i = get_global_id(0);
-
-    if (i >= pelements) {
-        return;
-    }
-
-    src1 = (global float*)((global char*)src1 + offset1);
-    dst = (global half*)((global char*)dst + offsetd);
-
-    long ksize = OW * (KH > 1 ? KW : 1);
-    long kx = i / ksize;
-    long kd = kx * ksize;
-    long ky = (i - kd) / OW;
-    long ix = i % OW;
-
-    long oh = get_group_id(1);
-    long batch = get_group_id(2) / IC;
-    long ic = get_group_id(2) % IC;
-
-    long iiw = ix * s0 + kx * d0 - p0;
-    long iih = oh * s1 + ky * d1 - p1;
-
-    long offset_dst =
-        ((batch * OH + oh) * OW + ix) * CHW +
-        (ic * (KW * KH) + ky * KW + kx);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] = 0.0f;
-    } else {
-        long offset_src = ic * delta_offset + batch * batch_offset;
-        dst[offset_dst] = src1[offset_src + iih * IW + iiw];
-    }
-}
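Both im2col kernels share the same index arithmetic: global id 0 enumerates (kx, ky, ix) within one output row, work-group ids 1 and 2 select the output row oh and the (batch, channel) pair, and each kernel tap either copies one input pixel or writes a zero when the tap falls in the padding. A host-side C model of that arithmetic for a single element follows; the variable names mirror the kernel, while the concrete sizes are made up.

#include <stdio.h>

int main(void) {
    /* illustrative tensor sizes: 8x8 input, 3x3 kernel, stride 1, no pad */
    const long IW = 8, IH = 8, OW = 6, OH = 6, KW = 3, KH = 3, IC = 2;
    const int  s0 = 1, s1 = 1, p0 = 0, p1 = 0, d0 = 1, d1 = 1;
    const long CHW = IC * KW * KH;            /* one im2col row length   */

    long batch = 0, oh = 2, ix = 3;           /* one output pixel        */
    long ic = 1, ky = 2, kx = 1;              /* one tap of the kernel   */

    /* which input pixel this tap reads (may be out of bounds = padding) */
    long iiw = ix * s0 + kx * d0 - p0;
    long iih = oh * s1 + ky * d1 - p1;

    /* where the patch element lands in the flattened im2col matrix */
    long offset_dst = ((batch * OH + oh) * OW + ix) * CHW
                    + (ic * (KW * KH) + ky * KW + kx);

    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)
        printf("dst[%ld] = 0 (padding)\n", offset_dst);
    else
        printf("dst[%ld] <- src(ic=%ld, y=%ld, x=%ld)\n", offset_dst, ic, iih, iiw);
    return 0;
}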
ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
DELETED
|
@@ -1,1225 +0,0 @@
|
|
| 1 |
-
//------------------------------------------------------------------------------
|
| 2 |
-
// This file is contains additional mulmat kernels
|
| 3 |
-
// (and potentially other kernels).
|
| 4 |
-
//------------------------------------------------------------------------------
|
| 5 |
-
#ifdef cl_khr_fp16
|
| 6 |
-
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
| 7 |
-
#elif defined(cl_amd_fp16)
|
| 8 |
-
#pragma OPENCL EXTENSION cl_amd_fp16 : enable
|
| 9 |
-
#else
|
| 10 |
-
#error "Half precision floating point not supportedby OpenCL implementation on your device."
|
| 11 |
-
#endif
|
| 12 |
-
|
| 13 |
-
#ifdef cl_khr_subgroups
|
| 14 |
-
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
| 15 |
-
#elif defined(cl_intel_subgroups)
|
| 16 |
-
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
| 17 |
-
#else
|
| 18 |
-
#error "Subgroup not supported on your device."
|
| 19 |
-
#endif
|
| 20 |
-
|
| 21 |
-
#ifdef cl_intel_required_subgroup_size
|
| 22 |
-
// Always use subgroup size of 32 on Intel.
|
| 23 |
-
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
| 24 |
-
#define INTEL_GPU 1
|
| 25 |
-
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
| 26 |
-
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
| 27 |
-
#elif defined(cl_qcom_reqd_sub_group_size)
|
| 28 |
-
// Always use subgroups size of 64 on Adreno.
|
| 29 |
-
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
| 30 |
-
#define ADRENO_GPU 1
|
| 31 |
-
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
| 32 |
-
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
| 33 |
-
#else
|
| 34 |
-
// TODO: do not know how to choose subgroup size on other GPUs.
|
| 35 |
-
#error "Selecting subgroup size is not supported on your device."
|
| 36 |
-
#endif
|
| 37 |
-
|
| 38 |
-
#define QK4_0 32
|
| 39 |
-
#define QR4_0 2
|
| 40 |
-
#define QK4_1 32
|
| 41 |
-
#define QR4_1 2
|
| 42 |
-
#define QK5_0 32
|
| 43 |
-
#define QR5_0 2
|
| 44 |
-
#define QK5_1 32
|
| 45 |
-
#define QR5_1 2
|
| 46 |
-
#define QK8_0 32
|
| 47 |
-
#define QR8_0 1
|
| 48 |
-
#define QK_K 256
|
| 49 |
-
#define K_QUANTS_PER_ITERATION 2
|
| 50 |
-
|
| 51 |
-
typedef char int8_t;
|
| 52 |
-
typedef uchar uint8_t;
|
| 53 |
-
typedef short int16_t;
|
| 54 |
-
typedef ushort uint16_t;
|
| 55 |
-
typedef int int32_t;
|
| 56 |
-
typedef uint uint32_t;
|
| 57 |
-
|
| 58 |
-
//------------------------------------------------------------------------------
|
| 59 |
-
// block_q4_0
|
| 60 |
-
//------------------------------------------------------------------------------
|
| 61 |
-
struct block_q4_0
|
| 62 |
-
{
|
| 63 |
-
half d;
|
| 64 |
-
uint8_t qs[QK4_0 / 2];
|
| 65 |
-
};
|
| 66 |
-
|
| 67 |
-
//------------------------------------------------------------------------------
|
| 68 |
-
// block_q6_K
|
| 69 |
-
//------------------------------------------------------------------------------
|
| 70 |
-
// 6-bit quantization
|
| 71 |
-
// weight is represented as x = a * q
|
| 72 |
-
// 16 blocks of 16 elements each
|
| 73 |
-
// Effectively 6.5625 bits per weight
|
| 74 |
-
typedef struct {
|
| 75 |
-
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
| 76 |
-
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
| 77 |
-
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
| 78 |
-
half d; // super-block scale
|
| 79 |
-
} block_q6_K;
|
| 80 |
-
|
| 81 |
-
//------------------------------------------------------------------------------
|
| 82 |
-
// These are the variant for matmatmul, based on the matvecmul kernel with
|
| 83 |
-
// flattened block_q4_0.
|
| 84 |
-
//------------------------------------------------------------------------------
|
| 85 |
-
|
| 86 |
-
// Common dot prod.
|
| 87 |
-
inline float mm_block_q_4_0_dot_y_flat(
|
| 88 |
-
global uchar * x,
|
| 89 |
-
global half * dh,
|
| 90 |
-
float sumy,
|
| 91 |
-
float16 yl,
|
| 92 |
-
int il
|
| 93 |
-
) {
|
| 94 |
-
float d = *dh;
|
| 95 |
-
global ushort * qs = ((global ushort *)x + il/2);
|
| 96 |
-
float acc = 0.f;
|
| 97 |
-
|
| 98 |
-
acc += yl.s0 * (qs[0] & 0x000F);
|
| 99 |
-
acc += yl.s1 * (qs[0] & 0x0F00);
|
| 100 |
-
acc += yl.s8 * (qs[0] & 0x00F0);
|
| 101 |
-
acc += yl.s9 * (qs[0] & 0xF000);
|
| 102 |
-
|
| 103 |
-
acc += yl.s2 * (qs[1] & 0x000F);
|
| 104 |
-
acc += yl.s3 * (qs[1] & 0x0F00);
|
| 105 |
-
acc += yl.sa * (qs[1] & 0x00F0);
|
| 106 |
-
acc += yl.sb * (qs[1] & 0xF000);
|
| 107 |
-
|
| 108 |
-
acc += yl.s4 * (qs[2] & 0x000F);
|
| 109 |
-
acc += yl.s5 * (qs[2] & 0x0F00);
|
| 110 |
-
acc += yl.sc * (qs[2] & 0x00F0);
|
| 111 |
-
acc += yl.sd * (qs[2] & 0xF000);
|
| 112 |
-
|
| 113 |
-
acc += yl.s6 * (qs[3] & 0x000F);
|
| 114 |
-
acc += yl.s7 * (qs[3] & 0x0F00);
|
| 115 |
-
acc += yl.se * (qs[3] & 0x00F0);
|
| 116 |
-
acc += yl.sf * (qs[3] & 0xF000);
|
| 117 |
-
|
| 118 |
-
return d * (sumy * -8.f + acc);
|
| 119 |
-
}

#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH

#ifdef INTEL_GPU
#define N_DST 8 // each SIMD group works on 8 rows (in weights matrix)
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
#elif defined (ADRENO_GPU)
#define N_DST 8
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
//
// This variant performs 1d blocking with 8x output.
// Each simdgroup outputs 8 values on `n0` dim (row in the output matrix).
//
inline void mul_mat_q_n_f32_1d_8x_flat(
        global uchar * src0_q,
        global half  * src0_d,
        global float * src1,
        global float * dst,
        int ne00,
        int ne01,
        int ne02,
        int ne10,
        int ne12,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    const int nb = ne00/QK4_0;

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = get_group_id(2);

    // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
    // a SIMD group in the grid. Each SIMD group produces N_DST values in the
    // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
    // Currently with llama2 7B, im is always 0.
    // TODO: how to handle im/gqa*(nb*ne0)?
    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;

    int i12 = im%ne12;
    int i13 = im/ne12;

    // The number of scales is the same as the number of blocks.
    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;

    global uchar * x = (global uchar *) src0_q + offset0_q;
    global half  * d = (global half  *) src0_d + offset0_d;
    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;

    float16 yl;
    float8 sumf = (float8)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);

    int ix = get_sub_group_local_id()/2;
    int il = 8*(get_sub_group_local_id()%2);

    global float * yb = y + ix*QK4_0 + il;

    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
        float sumy = 0.f;

        sumy += yb[0];
        sumy += yb[1];
        sumy += yb[2];
        sumy += yb[3];
        sumy += yb[4];
        sumy += yb[5];
        sumy += yb[6];
        sumy += yb[7];

        sumy += yb[16];
        sumy += yb[17];
        sumy += yb[18];
        sumy += yb[19];
        sumy += yb[20];
        sumy += yb[21];
        sumy += yb[22];
        sumy += yb[23];

        yl.s0 = yb[0];
        yl.s1 = yb[1]/256.f;

        yl.s2 = yb[2];
        yl.s3 = yb[3]/256.f;

        yl.s4 = yb[4];
        yl.s5 = yb[5]/256.f;

        yl.s6 = yb[6];
        yl.s7 = yb[7]/256.f;

        yl.s8 = yb[16]/16.f;
        yl.s9 = yb[17]/4096.f;

        yl.sa = yb[18]/16.f;
        yl.sb = yb[19]/4096.f;

        yl.sc = yb[20]/16.f;
        yl.sd = yb[21]/4096.f;

        yl.se = yb[22]/16.f;
        yl.sf = yb[23]/4096.f;

        sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
        sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
        sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
        sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);

        sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
        sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
        sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
        sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);

        yb += QK4_0 * (N_SIMDWIDTH/2);
    }

    float8 tot = (float8)(
        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
    );

    if (get_sub_group_local_id() == 0) {
        if (first_row + 0 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
        }
        if (first_row + 1 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
        }
        if (first_row + 2 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
        }
        if (first_row + 3 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
        }

        if (first_row + 4 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
        }
        if (first_row + 5 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
        }
        if (first_row + 6 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
        }
        if (first_row + 7 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
        }
    }
}

#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32_1d_8x_flat(
        global uchar * src0_q,
        global half  * src0_d,
        global float * src1,
        ulong offset1,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne10,
        int ne12,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    src1 = (global float*)((global char*)src1 + offset1);
    dst  = (global float*)((global char*)dst  + offsetd);

    mul_mat_q_n_f32_1d_8x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}
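
As a reading aid for the launch geometry this kernel implies: dimension 0 advances by N_DST output rows per work-group, dimension 1 walks the output columns, and dimension 2 walks the batch. A host-side sketch under the Adreno settings (N_SIMDWIDTH = 64, N_SIMDGROUP = 1, N_DST = 8); the helper name and the assumption that kernel arguments were already set with clSetKernelArg are illustrative, not taken from the original host code:

#include <CL/cl.h>

// Hypothetical dispatch helper for kernel_mul_mat_q4_0_f32_1d_8x_flat.
static cl_int enqueue_mul_mat_1d_8x(cl_command_queue queue, cl_kernel kern,
                                    int ne01, int ne1, int num_batches) {
    size_t local_size[3]  = { 64, 1, 1 };  // one 64-wide subgroup per work-group
    size_t global_size[3] = {
        (size_t)((ne01 + 7) / 8) * 64,     // one work-group per 8 output rows
        (size_t)ne1,                       // one work-group per output column
        (size_t)num_batches                // one work-group per (i12, i13) pair
    };
    return clEnqueueNDRangeKernel(queue, kern, 3, NULL,
                                  global_size, local_size, 0, NULL, NULL);
}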

#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH

#ifdef INTEL_GPU
#define N_DST 16 // each SIMD group works on 16 rows (in weights matrix)
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
#elif defined (ADRENO_GPU)
#define N_DST 16
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
//
// This variant performs 1d blocking with 16x output.
// Each simdgroup outputs 16 values on `n0` dim (row in the output matrix).
//
inline void mul_mat_q_n_f32_1d_16x_flat(
        global uchar * src0_q,
        global half  * src0_d,
        global float * src1,
        global float * dst,
        int ne00,
        int ne01,
        int ne02,
        int ne10,
        int ne12,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    const int nb = ne00/QK4_0;

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = get_group_id(2);

    // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
    // a SIMD group in the grid. Each SIMD group produces N_DST values in the
    // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
    // Currently with llama2 7B, im is always 0.
    // TODO: how to handle im/gqa*(nb*ne0)?
    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;

    int i12 = im%ne12;
    int i13 = im/ne12;

    // The number of scales is the same as the number of blocks.
    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;

    global uchar * x = (global uchar *) src0_q + offset0_q;
    global half  * d = (global half  *) src0_d + offset0_d;
    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;

    float16 yl;
    float16 sumf = (float16)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
                             0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);

    int ix = get_sub_group_local_id()/2;
    int il = 8*(get_sub_group_local_id()%2);

    global float * yb = y + ix*QK4_0 + il;

    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
        float sumy = 0.f;

        sumy += yb[0];
        sumy += yb[1];
        sumy += yb[2];
        sumy += yb[3];
        sumy += yb[4];
        sumy += yb[5];
        sumy += yb[6];
        sumy += yb[7];

        sumy += yb[16];
        sumy += yb[17];
        sumy += yb[18];
        sumy += yb[19];
        sumy += yb[20];
        sumy += yb[21];
        sumy += yb[22];
        sumy += yb[23];

        yl.s0 = yb[0];
        yl.s1 = yb[1]/256.f;

        yl.s2 = yb[2];
        yl.s3 = yb[3]/256.f;

        yl.s4 = yb[4];
        yl.s5 = yb[5]/256.f;

        yl.s6 = yb[6];
        yl.s7 = yb[7]/256.f;

        yl.s8 = yb[16]/16.f;
        yl.s9 = yb[17]/4096.f;

        yl.sa = yb[18]/16.f;
        yl.sb = yb[19]/4096.f;

        yl.sc = yb[20]/16.f;
        yl.sd = yb[21]/4096.f;

        yl.se = yb[22]/16.f;
        yl.sf = yb[23]/4096.f;

        sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  0*nb*QK4_0/2, d + ib +  0*nb, sumy, yl, il);
        sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  1*nb*QK4_0/2, d + ib +  1*nb, sumy, yl, il);
        sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  2*nb*QK4_0/2, d + ib +  2*nb, sumy, yl, il);
        sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  3*nb*QK4_0/2, d + ib +  3*nb, sumy, yl, il);

        sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  4*nb*QK4_0/2, d + ib +  4*nb, sumy, yl, il);
        sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  5*nb*QK4_0/2, d + ib +  5*nb, sumy, yl, il);
        sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  6*nb*QK4_0/2, d + ib +  6*nb, sumy, yl, il);
        sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  7*nb*QK4_0/2, d + ib +  7*nb, sumy, yl, il);

        sumf.s8 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  8*nb*QK4_0/2, d + ib +  8*nb, sumy, yl, il);
        sumf.s9 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  9*nb*QK4_0/2, d + ib +  9*nb, sumy, yl, il);
        sumf.sa += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 10*nb*QK4_0/2, d + ib + 10*nb, sumy, yl, il);
        sumf.sb += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 11*nb*QK4_0/2, d + ib + 11*nb, sumy, yl, il);

        sumf.sc += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 12*nb*QK4_0/2, d + ib + 12*nb, sumy, yl, il);
        sumf.sd += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 13*nb*QK4_0/2, d + ib + 13*nb, sumy, yl, il);
        sumf.se += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 14*nb*QK4_0/2, d + ib + 14*nb, sumy, yl, il);
        sumf.sf += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 15*nb*QK4_0/2, d + ib + 15*nb, sumy, yl, il);

        yb += QK4_0 * (N_SIMDWIDTH/2);
    }

    float16 tot = (float16)(
        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7),

        sub_group_reduce_add(sumf.s8), sub_group_reduce_add(sumf.s9),
        sub_group_reduce_add(sumf.sa), sub_group_reduce_add(sumf.sb),
        sub_group_reduce_add(sumf.sc), sub_group_reduce_add(sumf.sd),
        sub_group_reduce_add(sumf.se), sub_group_reduce_add(sumf.sf)
    );

    if (get_sub_group_local_id() == 0) {
        if (first_row + 0 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
        }
        if (first_row + 1 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
        }
        if (first_row + 2 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
        }
        if (first_row + 3 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
        }

        if (first_row + 4 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
        }
        if (first_row + 5 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
        }
        if (first_row + 6 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
        }
        if (first_row + 7 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
        }

        if (first_row + 8 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 8] = tot.s8;
        }
        if (first_row + 9 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 9] = tot.s9;
        }
        if (first_row + 10 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 10] = tot.sa;
        }
        if (first_row + 11 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 11] = tot.sb;
        }

        if (first_row + 12 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 12] = tot.sc;
        }
        if (first_row + 13 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 13] = tot.sd;
        }
        if (first_row + 14 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 14] = tot.se;
        }
        if (first_row + 15 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 15] = tot.sf;
        }
    }
}

#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32_1d_16x_flat(
        global uchar * src0_q,
        global half  * src0_d,
        global float * src1,
        ulong offset1,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne10,
        int ne12,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    src1 = (global float*)((global char*)src1 + offset1);
    dst  = (global float*)((global char*)dst  + offsetd);

    mul_mat_q_n_f32_1d_16x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}

//------------------------------------------------------------------------------
// kernel_mul_mat_q4_0_f32_flat_v0
//------------------------------------------------------------------------------
inline float block_q_4_0_dot_y_flat_v2(
        half x,
        half d,
        float sumy,
        float4 yl
) {
    uchar2 q = as_uchar2(x);
    float acc = 0.0f;

    acc += (q.s0 & 0x0F) * yl.s0;
    acc += (q.s1 & 0x0F) * yl.s1;

    acc += (q.s0 & 0xF0) * yl.s2;
    acc += (q.s1 & 0xF0) * yl.s3;

    return d * (sumy * -8.f + acc);
}

inline float block_q_4_0_dot_y_flat_v4(
        float x,
        half d,
        float sumy,
        float8 yl
) {
    uchar4 q = as_uchar4(x);
    float acc = 0.0f;

    acc += (q.s0 & 0x0F) * yl.s0;
    acc += (q.s1 & 0x0F) * yl.s1;
    acc += (q.s2 & 0x0F) * yl.s2;
    acc += (q.s3 & 0x0F) * yl.s3;

    acc += (q.s0 & 0xF0) * yl.s4;
    acc += (q.s1 & 0xF0) * yl.s5;
    acc += (q.s2 & 0xF0) * yl.s6;
    acc += (q.s3 & 0xF0) * yl.s7;

    return d * (sumy * -8.f + acc);
}

inline float block_q_4_0_dot_y_flat_v8(
        float2 x,
        half d,
        float sumy,
        float16 yl
) {
    uchar8 q = as_uchar8(x);
    float acc = 0.0f;

    acc += (q.s0 & 0x0F) * yl.s0;
    acc += (q.s1 & 0x0F) * yl.s1;
    acc += (q.s2 & 0x0F) * yl.s2;
    acc += (q.s3 & 0x0F) * yl.s3;
    acc += (q.s4 & 0x0F) * yl.s4;
    acc += (q.s5 & 0x0F) * yl.s5;
    acc += (q.s6 & 0x0F) * yl.s6;
    acc += (q.s7 & 0x0F) * yl.s7;

    acc += (q.s0 & 0xF0) * yl.s8;
    acc += (q.s1 & 0xF0) * yl.s9;
    acc += (q.s2 & 0xF0) * yl.sa;
    acc += (q.s3 & 0xF0) * yl.sb;
    acc += (q.s4 & 0xF0) * yl.sc;
    acc += (q.s5 & 0xF0) * yl.sd;
    acc += (q.s6 & 0xF0) * yl.se;
    acc += (q.s7 & 0xF0) * yl.sf;

    return d * (sumy * -8.f + acc);
}
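
These helpers take their quant bytes bit-cast into half, float, or float2 so the caller can fetch them with a single scalar or vector load; as_uchar2/4/8 just reinterprets the bits. A minimal illustration of that round trip (q4_first_quant is a made-up name, not from the original file):

// Illustrative only: recover quant bytes from a float-typed load, as
// block_q_4_0_dot_y_flat_v4 does internally.
inline int q4_first_quant(float packed) {
    uchar4 bytes = as_uchar4(packed); // reinterpret the 4 loaded bytes
    return bytes.s0 & 0x0F;           // low nibble of byte 0 = quant 0
}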

#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH

#ifdef INTEL_GPU
#define THREADS_PER_BLK 4 // Number of threads per block, i.e., each thread processes 1/THREADS_PER_BLK of a block
#define N_DST 4
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 16
#elif defined (ADRENO_GPU)
#define THREADS_PER_BLK 4
#define N_DST 4
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif

#if THREADS_PER_BLK == 2 // Each thread processes 1/2 block
#   define ACT_TY float16
#   define Q_BLK_LD_TY float2
#   define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v8
#elif THREADS_PER_BLK == 4 // Each thread processes 1/4 block
#   define ACT_TY float8
#   define Q_BLK_LD_TY float
#   define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v4
#elif THREADS_PER_BLK == 8 // Each thread processes 1/8 block
#   define ACT_TY float4
#   define Q_BLK_LD_TY half
#   define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v2
#endif

#define BTYES_PER_THREAD_IN_BLK (QK4_0/2/THREADS_PER_BLK)

#if N_DST == 2
#   define SUM_TY float2
#elif N_DST == 4
#   define SUM_TY float4
#elif N_DST == 8
#   define SUM_TY float8
#elif N_DST == 16
#   define SUM_TY float16
#endif
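
With the values chosen above (THREADS_PER_BLK == 4 and N_DST == 4 on both backends), the dispatch macros resolve as follows; this expansion is written out purely as a reading aid:

// Effective definitions for THREADS_PER_BLK == 4, N_DST == 4:
//   ACT_TY                  -> float8  (8 activation values cached per thread)
//   Q_BLK_LD_TY             -> float   (one 4-byte load = 1/4 of a block's quants)
//   block_q_4_0_dot_y_flat  -> block_q_4_0_dot_y_flat_v4
//   BTYES_PER_THREAD_IN_BLK -> QK4_0/2/4 = 4 bytes per thread per block
//   SUM_TY                  -> float4  (one partial sum per output row)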

#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32_flat_v0(
        global uchar * src0_q,
        global half  * src0_d,
        global float * src1,
        ulong offset1,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne10,
        int ne12,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    src1 = (global float*)((global char*)src1 + offset1);
    dst  = (global float*)((global char*)dst  + offsetd);

    const int nb = ne00/QK4_0;

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = get_group_id(2);

    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;

    int i12 = im%ne12;
    int i13 = im/ne12;

    // The number of scales is the same as the number of blocks.
    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;

    global uchar * x = (global uchar *) src0_q + offset0_q;
    global half  * d = (global half  *) src0_d + offset0_d;
    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;

    int ix = get_sub_group_local_id()/THREADS_PER_BLK;
    int il = get_sub_group_local_id()%THREADS_PER_BLK;

    global float * yb = y + ix*QK4_0 + BTYES_PER_THREAD_IN_BLK*il;

    // Registers for caching activation
    ACT_TY yl = 0.f;

    // Registers for caching quants
    Q_BLK_LD_TY q_blk_0 = 0, q_blk_1 = 0;
#if N_DST == 4 || N_DST == 8 || N_DST == 16
    Q_BLK_LD_TY q_blk_2 = 0, q_blk_3 = 0;
#endif
#if N_DST == 8 || N_DST == 16
    Q_BLK_LD_TY q_blk_4 = 0, q_blk_5 = 0, q_blk_6 = 0, q_blk_7 = 0;
#endif

    // Partial sum
    SUM_TY sumf = 0.f;

    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/THREADS_PER_BLK) {
        float sumy = 0.f;

        q_blk_0 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 0*nb*QK4_0/2);
        q_blk_1 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 1*nb*QK4_0/2);
#if N_DST == 4 || N_DST == 8 || N_DST == 16
        q_blk_2 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 2*nb*QK4_0/2);
        q_blk_3 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 3*nb*QK4_0/2);
#endif
#if N_DST == 8 || N_DST == 16
        q_blk_4 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 4*nb*QK4_0/2));
        q_blk_5 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 5*nb*QK4_0/2));
        q_blk_6 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 6*nb*QK4_0/2));
        q_blk_7 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 7*nb*QK4_0/2));
#endif

        // Load activation
#if THREADS_PER_BLK == 2 // Each thread processes 1/2 block
        yl.s01234567 = *(global float8 *)(yb);
        yl.s89abcdef = *(global float8 *)(yb + 16);

        sumy += yl.s0;
        sumy += yl.s1;
        sumy += yl.s2;
        sumy += yl.s3;
        sumy += yl.s4;
        sumy += yl.s5;
        sumy += yl.s6;
        sumy += yl.s7;
        sumy += yl.s8; yl.s8 /= 16.f;
        sumy += yl.s9; yl.s9 /= 16.f;
        sumy += yl.sa; yl.sa /= 16.f;
        sumy += yl.sb; yl.sb /= 16.f;
        sumy += yl.sc; yl.sc /= 16.f;
        sumy += yl.sd; yl.sd /= 16.f;
        sumy += yl.se; yl.se /= 16.f;
        sumy += yl.sf; yl.sf /= 16.f;
#elif THREADS_PER_BLK == 4 // Each thread processes 1/4 block
        yl.s0123 = *(global float4 *)(yb);
        yl.s4567 = *(global float4 *)(yb + 16);

        sumy += yl.s0;
        sumy += yl.s1;
        sumy += yl.s2;
        sumy += yl.s3;
        sumy += yl.s4; yl.s4 /= 16.f;
        sumy += yl.s5; yl.s5 /= 16.f;
        sumy += yl.s6; yl.s6 /= 16.f;
        sumy += yl.s7; yl.s7 /= 16.f;
#elif THREADS_PER_BLK == 8 // Each thread processes 1/8 block
        yl.s01 = *(global float2 *)(yb);
        yl.s23 = *(global float2 *)(yb + 16);

        sumy += yl.s0;
        sumy += yl.s1;
        sumy += yl.s2; yl.s2 /= 16.f;
        sumy += yl.s3; yl.s3 /= 16.f;
#endif

        sumf.s0 += block_q_4_0_dot_y_flat(q_blk_0, *(d + ib + 0*nb), sumy, yl);
        sumf.s1 += block_q_4_0_dot_y_flat(q_blk_1, *(d + ib + 1*nb), sumy, yl);
#if N_DST == 4 || N_DST == 8 || N_DST == 16
        sumf.s2 += block_q_4_0_dot_y_flat(q_blk_2, *(d + ib + 2*nb), sumy, yl);
        sumf.s3 += block_q_4_0_dot_y_flat(q_blk_3, *(d + ib + 3*nb), sumy, yl);
#endif
#if N_DST == 8 || N_DST == 16
        sumf.s4 += block_q_4_0_dot_y_flat(q_blk_4, *(d + ib + 4*nb), sumy, yl);
        sumf.s5 += block_q_4_0_dot_y_flat(q_blk_5, *(d + ib + 5*nb), sumy, yl);
        sumf.s6 += block_q_4_0_dot_y_flat(q_blk_6, *(d + ib + 6*nb), sumy, yl);
        sumf.s7 += block_q_4_0_dot_y_flat(q_blk_7, *(d + ib + 7*nb), sumy, yl);
#endif

        yb += QK4_0 * (N_SIMDWIDTH/THREADS_PER_BLK);
    }

    SUM_TY tot = (SUM_TY)(
        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1)
#if N_DST == 4 || N_DST == 8 || N_DST == 16
        , sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
#endif
#if N_DST == 8 || N_DST == 16
        , sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5)
        , sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
#endif
    );

    if (get_sub_group_local_id() == 0) {
        if (first_row + 0 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
        }
        if (first_row + 1 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
        }
#if N_DST == 4 || N_DST == 8 || N_DST == 16
        if (first_row + 2 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
        }
        if (first_row + 3 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
        }
#endif
#if N_DST == 8 || N_DST == 16
        if (first_row + 4 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
        }
        if (first_row + 5 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
        }
        if (first_row + 6 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
        }
        if (first_row + 7 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
        }
#endif
    }
}

//------------------------------------------------------------------------------
// Using image1d_buffer_t

#if defined(cl_qcom_subgroup_shuffle)
#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
float qcom_sub_group_reduce_add(float sum) {
    sum += qcom_sub_group_shuffle_down(sum, 32, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
    sum += qcom_sub_group_shuffle_down(sum, 16, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
    sum += qcom_sub_group_shuffle_down(sum,  8, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
    sum += qcom_sub_group_shuffle_down(sum,  4, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
    sum += qcom_sub_group_shuffle_down(sum,  2, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
    sum += qcom_sub_group_shuffle_down(sum,  1, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
    return sum;
}
#define sub_group_reduce_add qcom_sub_group_reduce_add
#else
#define sub_group_reduce_add sub_group_reduce_add
#endif
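
The unrolled ladder above is a log2-width tree reduction for a 64-wide wave, specialized to Qualcomm's shuffle intrinsic. The same idea over a generic subgroup size, written against the standard cl_khr_subgroup_shuffle_relative extension, might look like this (an assumption; this generic version is not in the original file):

#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle_relative : enable

// Generic tree reduction across one subgroup: halve the shuffle distance
// each step; lane 0 ends up holding the full sum.
inline float generic_reduce_add(float sum) {
    for (uint delta = get_max_sub_group_size()/2; delta > 0; delta >>= 1) {
        sum += sub_group_shuffle_down(sum, delta);
    }
    return sum;
}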

#undef THREADS_PER_BLK
#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH

#ifdef INTEL_GPU
#define THREADS_PER_BLK 4 // Number of threads per block, i.e., each thread processes 1/THREADS_PER_BLK of a block
#define N_DST 4
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 16
#elif defined (ADRENO_GPU)
#define THREADS_PER_BLK 4
#define N_DST 4
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif

#if THREADS_PER_BLK == 2 // Each thread processes 1/2 block
#   define ACT_TY float16
#   define Q_BLK_LD_TY float2
#   define EXTRACT_BLK_DATA(tmp, part) *((float2*)&tmp + part)
#   define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v8
#elif THREADS_PER_BLK == 4 // Each thread processes 1/4 block
#   define ACT_TY float8
#   define Q_BLK_LD_TY float
#   define EXTRACT_BLK_DATA(tmp, part) *((float*)&tmp + part)
#   define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v4
#elif THREADS_PER_BLK == 8 // Each thread processes 1/8 block
#   define ACT_TY float4
#   define Q_BLK_LD_TY half
#   define EXTRACT_BLK_DATA(tmp, part) *((half*)&tmp + part)
#   define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v2
#endif

#define BTYES_PER_THREAD_IN_BLK (QK4_0/2/THREADS_PER_BLK)

#if N_DST == 2
#   define SUM_TY float2
#elif N_DST == 4
#   define SUM_TY float4
#elif N_DST == 8
#   define SUM_TY float8
#elif N_DST == 16
#   define SUM_TY float16
#endif
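
In this variant a whole q4_0 block's 16 quant bytes arrive as a single float4 texel, and EXTRACT_BLK_DATA slices out the part belonging to the calling thread by reinterpreting the texel's storage. Spelled out for THREADS_PER_BLK == 4 (the helper below is illustrative only, not part of the original file):

// What EXTRACT_BLK_DATA(tmp, part) expands to for THREADS_PER_BLK == 4:
// take the part-th float (4 quant bytes) out of a float4 texel holding a
// full q4_0 block's 16 quant bytes.
inline float extract_quarter_block(float4 tmp, int part) {
    return *((float*)&tmp + part);
}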

#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32_flat_img_v0(
        read_only image1d_buffer_t src0_q,
        read_only image1d_buffer_t src0_d,
        global float * src1,
        ulong offset1,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne10,
        int ne12,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    src1 = (global float*)((global char*)src1 + offset1);
    dst  = (global float*)((global char*)dst  + offsetd);

    const int nb = ne00/QK4_0;

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = get_group_id(2);

    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;

    int i12 = im%ne12;
    int i13 = im/ne12;

    // The number of scales is the same as the number of blocks.
    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
    // Each block of quants is read as one float4 texel, hence offset0_q is in blocks.
    ulong offset0_q = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);

    global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;

    int ix = get_sub_group_local_id()/THREADS_PER_BLK;
    int il = get_sub_group_local_id()%THREADS_PER_BLK;

    global float * yb = y + ix*QK4_0 + BTYES_PER_THREAD_IN_BLK*il;

    // Registers for caching activation
    ACT_TY yl = 0.f;

    // Registers for caching quants
    Q_BLK_LD_TY q_blk_0 = 0, q_blk_1 = 0;
#if N_DST == 4 || N_DST == 8 || N_DST == 16
    Q_BLK_LD_TY q_blk_2 = 0, q_blk_3 = 0;
#endif
#if N_DST == 8 || N_DST == 16
    Q_BLK_LD_TY q_blk_4 = 0, q_blk_5 = 0, q_blk_6 = 0, q_blk_7 = 0;
#endif

    // Partial sum
    SUM_TY sumf = 0.f;

    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/THREADS_PER_BLK) {
        float sumy = 0.f;

        float4 tmp;
        tmp = read_imagef(src0_q, offset0_q + ib + 0*nb);
        q_blk_0 = EXTRACT_BLK_DATA(tmp, il);
        tmp = read_imagef(src0_q, offset0_q + ib + 1*nb);
        q_blk_1 = EXTRACT_BLK_DATA(tmp, il);
#if N_DST == 4 || N_DST == 8 || N_DST == 16
        tmp = read_imagef(src0_q, offset0_q + ib + 2*nb);
        q_blk_2 = EXTRACT_BLK_DATA(tmp, il);
        tmp = read_imagef(src0_q, offset0_q + ib + 3*nb);
        q_blk_3 = EXTRACT_BLK_DATA(tmp, il);
#endif
#if N_DST == 8 || N_DST == 16
        tmp = read_imagef(src0_q, offset0_q + ib + 4*nb);
        q_blk_4 = EXTRACT_BLK_DATA(tmp, il);
        tmp = read_imagef(src0_q, offset0_q + ib + 5*nb);
        q_blk_5 = EXTRACT_BLK_DATA(tmp, il);
        tmp = read_imagef(src0_q, offset0_q + ib + 6*nb);
        q_blk_6 = EXTRACT_BLK_DATA(tmp, il);
        tmp = read_imagef(src0_q, offset0_q + ib + 7*nb);
        q_blk_7 = EXTRACT_BLK_DATA(tmp, il);
#endif

        // Load activation
#if THREADS_PER_BLK == 2 // Each thread processes 1/2 block
        yl.s01234567 = *(global float8 *)(yb);
        yl.s89abcdef = *(global float8 *)(yb + 16);

        sumy += yl.s0;
        sumy += yl.s1;
        sumy += yl.s2;
        sumy += yl.s3;
        sumy += yl.s4;
        sumy += yl.s5;
        sumy += yl.s6;
        sumy += yl.s7;
        sumy += yl.s8; yl.s8 /= 16.f;
        sumy += yl.s9; yl.s9 /= 16.f;
        sumy += yl.sa; yl.sa /= 16.f;
        sumy += yl.sb; yl.sb /= 16.f;
        sumy += yl.sc; yl.sc /= 16.f;
        sumy += yl.sd; yl.sd /= 16.f;
        sumy += yl.se; yl.se /= 16.f;
        sumy += yl.sf; yl.sf /= 16.f;
#elif THREADS_PER_BLK == 4 // Each thread processes 1/4 block
        yl.s0123 = *(global float4 *)(yb);
        yl.s4567 = *(global float4 *)(yb + 16);

        sumy += yl.s0;
        sumy += yl.s1;
        sumy += yl.s2;
        sumy += yl.s3;
        sumy += yl.s4; yl.s4 /= 16.f;
        sumy += yl.s5; yl.s5 /= 16.f;
        sumy += yl.s6; yl.s6 /= 16.f;
        sumy += yl.s7; yl.s7 /= 16.f;
#elif THREADS_PER_BLK == 8 // Each thread processes 1/8 block
        yl.s01 = *(global float2 *)(yb);
        yl.s23 = *(global float2 *)(yb + 16);

        sumy += yl.s0;
        sumy += yl.s1;
        sumy += yl.s2; yl.s2 /= 16.f;
        sumy += yl.s3; yl.s3 /= 16.f;
#endif

        sumf.s0 += block_q_4_0_dot_y_flat(q_blk_0, read_imageh(src0_d, offset0_d + ib + 0*nb).s0, sumy, yl);
        sumf.s1 += block_q_4_0_dot_y_flat(q_blk_1, read_imageh(src0_d, offset0_d + ib + 1*nb).s0, sumy, yl);
#if N_DST == 4 || N_DST == 8 || N_DST == 16
        sumf.s2 += block_q_4_0_dot_y_flat(q_blk_2, read_imageh(src0_d, offset0_d + ib + 2*nb).s0, sumy, yl);
        sumf.s3 += block_q_4_0_dot_y_flat(q_blk_3, read_imageh(src0_d, offset0_d + ib + 3*nb).s0, sumy, yl);
#endif
#if N_DST == 8 || N_DST == 16
        sumf.s4 += block_q_4_0_dot_y_flat(q_blk_4, read_imageh(src0_d, offset0_d + ib + 4*nb).s0, sumy, yl);
        sumf.s5 += block_q_4_0_dot_y_flat(q_blk_5, read_imageh(src0_d, offset0_d + ib + 5*nb).s0, sumy, yl);
        sumf.s6 += block_q_4_0_dot_y_flat(q_blk_6, read_imageh(src0_d, offset0_d + ib + 6*nb).s0, sumy, yl);
        sumf.s7 += block_q_4_0_dot_y_flat(q_blk_7, read_imageh(src0_d, offset0_d + ib + 7*nb).s0, sumy, yl);
#endif

        yb += QK4_0 * (N_SIMDWIDTH/THREADS_PER_BLK);
    }

    SUM_TY tot = (SUM_TY)(
        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1)
#if N_DST == 4 || N_DST == 8 || N_DST == 16
        , sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
#endif
#if N_DST == 8 || N_DST == 16
        , sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5)
        , sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
#endif
    );

    if (get_sub_group_local_id() == 0) {
        if (first_row + 0 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
        }
        if (first_row + 1 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
        }
#if N_DST == 4 || N_DST == 8 || N_DST == 16
        if (first_row + 2 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
        }
        if (first_row + 3 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
        }
#endif
#if N_DST == 8 || N_DST == 16
        if (first_row + 4 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
        }
        if (first_row + 5 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
        }
        if (first_row + 6 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
        }
        if (first_row + 7 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
        }
#endif
    }
}

//------------------------------------------------------------------------------
// kernel_mul_mv_q6_K_f32
//------------------------------------------------------------------------------

#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH

#ifdef INTEL_GPU
#define N_DST 1 // number of rows each SIMD group works on
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // SIMD group size
#elif defined (ADRENO_GPU)
#define N_DST 1
#define N_SIMDGROUP 2
#define N_SIMDWIDTH 64
#endif

#define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes

#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_q6_K_f32(
        global void  * src0,
        ulong offset0,
        global float * src1,
        ulong offset1,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne10,
        int ne12,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    src0 = (global void*)((global char*)src0 + offset0);
    src1 = (global float*)((global char*)src1 + offset1);
    dst  = (global float*)((global char*)dst  + offsetd);

    uchar kmask1 = 0x03;
    uchar kmask2 = 0x0C;
    uchar kmask3 = 0x30;
    uchar kmask4 = 0xC0;

    int nb = ne00/QK_K;

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = get_group_id(2);

    int row = N_SIMDGROUP * r0 + get_sub_group_id();

    int i12 = im%ne12;
    int i13 = im/ne12;

    ulong offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);

    global block_q6_K * x  = (global block_q6_K *) src0 + row*nb + offset_src0;
    global float      * yy = (global float      *) src1 + r1*ne10 + im*ne00*ne1;

    float sumf = 0;

    // For Q6_K quantization, 16 values form a subblock and 16 subblocks form a
    // (super) block. Values in a subblock share a scale that is quantized with 8 bits;
    // the entire block shares a single floating point scale.
    // For work distribution, each thread processes a subblock (16 weights), hence
    // 16 threads process a (super) block -- a subgroup thus handles SIMDWIDTH/16
    // (super) blocks -- this is the block stride.
    // The 16 threads that process a (super) block are split into 2 portions, each with
    // 8 threads; each portion works on 8 subblocks.
    // For a subgroup of 16 threads, the entire subgroup works on a single (super) block
    // before moving to the next (super) block. Thread0 - thread7 work on the
    // first 8 subblocks; thread8 - thread15 work on the last 8 subblocks.
    // Thread0 - thread3 work on subblocks 0, 2, 4, 6; thread4 - thread7 work on
    // subblocks 1, 3, 5, 7. Each thread does not work on an entire subblock, but
    // works on a total of 16 weight values.
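    // Worked example (added for illustration; not in the original source):
    // with the Adreno settings N_SIMDWIDTH = 64 and BLOCK_STRIDE = 4,
    // subgroup lane 37 gets:
    //   ix  = 37 % 4 = 1  -> starts at (super) block 1, strides by 4 blocks
    //   tid = 37 / 4 = 9  -> ip = 9/8 = 1 (second half), il = 9%8 = 1
    //   l0  = 4*1 = 4,  is = 8*1 + 4/16 = 8
    //   y_offset = 128*1 + 4 = 132, q_offset_l = 64 + 4 = 68, q_offset_h = 32 + 4 = 36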
    int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
    int ix  = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
    int ip  = tid/8;        // first or second half of (super) block (0 or 1)
    int il  = tid%8;        // each half has 8 parts, one per scale
    int n   = 4;            // 4 scales at a time (and 4 sums)
    int l0  = n*il;         // offset into half-block, 0..28
    int is  = 8*ip + l0/16; // 0, 1, 8, 9

    int y_offset   = 128*ip + l0;
    int q_offset_l =  64*ip + l0;
    int q_offset_h =  32*ip + l0;

    for (int i = ix; i < nb; i += BLOCK_STRIDE) {

        global uint8_t * q1 = x[i].ql + q_offset_l;
        global uint8_t * q2 = q1 + QK_K/8;
        global uint8_t * qh = x[i].qh + q_offset_h;
        global int8_t  * sc = x[i].scales + is;

        global float * y = yy + i * QK_K + y_offset;

        float dall = x[i].d;

        float4 sums = {0.f, 0.f, 0.f, 0.f};

        sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & kmask1) << 4)) - 32.f);
        sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & kmask2) << 2)) - 32.f);
        sums.s2 += y[0+64] * ((float)((q1[0]  >> 4) | ((qh[0] & kmask3) << 0)) - 32.f);
        sums.s3 += y[0+96] * ((float)((q2[0]  >> 4) | ((qh[0] & kmask4) >> 2)) - 32.f);

        sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & kmask1) << 4)) - 32.f);
        sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & kmask2) << 2)) - 32.f);
        sums.s2 += y[1+64] * ((float)((q1[1]  >> 4) | ((qh[1] & kmask3) << 0)) - 32.f);
        sums.s3 += y[1+96] * ((float)((q2[1]  >> 4) | ((qh[1] & kmask4) >> 2)) - 32.f);

        sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & kmask1) << 4)) - 32.f);
        sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & kmask2) << 2)) - 32.f);
        sums.s2 += y[2+64] * ((float)((q1[2]  >> 4) | ((qh[2] & kmask3) << 0)) - 32.f);
        sums.s3 += y[2+96] * ((float)((q2[2]  >> 4) | ((qh[2] & kmask4) >> 2)) - 32.f);

        sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & kmask1) << 4)) - 32.f);
        sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & kmask2) << 2)) - 32.f);
        sums.s2 += y[3+64] * ((float)((q1[3]  >> 4) | ((qh[3] & kmask3) << 0)) - 32.f);
        sums.s3 += y[3+96] * ((float)((q2[3]  >> 4) | ((qh[3] & kmask4) >> 2)) - 32.f);

        sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
    }

    float tot = sub_group_reduce_add(sumf);
    if (get_sub_group_local_id() == 0) {
        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
    }
}
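
For reference, each of the four `sums` lanes above rebuilds 6-bit quants from a low nibble in ql plus two bits from qh. A scalar sketch of reconstructing a single weight (illustrative only; q6_K_weight is not from the original file):

// One q6_K weight: 4 low bits from ql, 2 high bits from qh, centred at 32,
// then scaled by the subblock int8 scale and the super-block scale.
inline float q6_K_weight(uchar ql_nibble, uchar qh_2bits, char sc, float d) {
    int q6 = (ql_nibble & 0xF) | ((qh_2bits & 0x3) << 4); // 0..63
    return d * sc * (float)(q6 - 32);
}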

ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
DELETED
@@ -1,139 +0,0 @@
// src0_q, src0_d, src1 are transposed as a preprocessing step
// 4-bit weights are transposed in groups of 4 (unsigned short int)
// consider weights originally "next to each other", now "on top of each other"
// each fiber computes an 8x4 tile of output elements
// using unshuffled weights
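
A reading aid for the transposed layout described above (added here; not part of the original kernel): after preprocessing, the lane of a ushort4 load selects the output row, while the nibble position inside each ushort selects the K-step.

// ushort4 bits4 = vload4(0, weight_ptr + (i/4)*m) yields, per lane:
//   bits4.s0 -> row gx_2+0: nibble 0 = K-step i, nibble 1 = i+1,
//                           nibble 2 = i+2,      nibble 3 = i+3
//   bits4.s1 -> row gx_2+1: the same four K-steps
//   bits4.s2 -> row gx_2+2: the same four K-steps
//   bits4.s3 -> row gx_2+3: the same four K-steps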

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable

#ifdef cl_qcom_reqd_sub_group_size
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_128
#endif

kernel void kernel_mul_mat_Ab_Bi_8x4(
        global const ushort * src0_q,      // quantized A
        global const half   * src0_d,      // A scales
        __read_only image1d_buffer_t src1, // B (1d image)
        global float * dst,                // C
        int m,                             // M
        int n,                             // N with padding
        int k,                             // K
        int n_no_padding                   // N without padding
) {

    int m_4 = m >> 2;
    int n_4 = n >> 2;

    int gy = get_global_id(0);
    int gx = get_global_id(1);
    int gx_2 = gx << 2;

    half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // 8x4 output elements
    half8 B;                              // registers for activations
    half4 dequantized_weights;            // registers for dequantized weights
    __global const ushort* weight_ptr = src0_q + gx_2; // pointer for weights
    __global const half*   scale_ptr  = src0_d + gx_2; // pointer for scales

    for (int i = 0; i < k; i += 4) { // loop through K dimension

        B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4));
        B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1);

        // keep (i/4) and (i/32) in parentheses, rounds down
        // load 4 consecutive groups of 4 weights
        ushort4 bits4 = vload4(0, weight_ptr + (i/4)*(m)); // (i/4) because weights grouped in 4s

        // load 4 consecutive scales
        half4 scale = vload4(0, scale_ptr + (i/32)*(m)); // (i/32) because 1 scale per 32 elements

        // j=0
        dequantized_weights.s0 = ((bits4.s0 & (0x000F)) - 8) * scale.s0; // dequantize a row of the 16 weights
        dequantized_weights.s1 = ((bits4.s1 & (0x000F)) - 8) * scale.s1;
        dequantized_weights.s2 = ((bits4.s2 & (0x000F)) - 8) * scale.s2;
        dequantized_weights.s3 = ((bits4.s3 & (0x000F)) - 8) * scale.s3;
        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
        c1 += B * dequantized_weights.s1;
        c2 += B * dequantized_weights.s2;
        c3 += B * dequantized_weights.s3;

        // j=1
        B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4));
        B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1);
        dequantized_weights.s0 = (((bits4.s0 & (0x00F0)) >> 4) - 8) * scale.s0; // dequantize a row of the 16 weights
        dequantized_weights.s1 = (((bits4.s1 & (0x00F0)) >> 4) - 8) * scale.s1;
        dequantized_weights.s2 = (((bits4.s2 & (0x00F0)) >> 4) - 8) * scale.s2;
        dequantized_weights.s3 = (((bits4.s3 & (0x00F0)) >> 4) - 8) * scale.s3;
        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
        c1 += B * dequantized_weights.s1;
        c2 += B * dequantized_weights.s2;
        c3 += B * dequantized_weights.s3;

        // j=2
        B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4));
        B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1);
        dequantized_weights.s0 = (((bits4.s0 & (0x0F00)) >> 8) - 8) * scale.s0; // dequantize a row of the 16 weights
        dequantized_weights.s1 = (((bits4.s1 & (0x0F00)) >> 8) - 8) * scale.s1;
        dequantized_weights.s2 = (((bits4.s2 & (0x0F00)) >> 8) - 8) * scale.s2;
        dequantized_weights.s3 = (((bits4.s3 & (0x0F00)) >> 8) - 8) * scale.s3;
        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
        c1 += B * dequantized_weights.s1;
        c2 += B * dequantized_weights.s2;
        c3 += B * dequantized_weights.s3;

        // j=3
        B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4));
        B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1);
        dequantized_weights.s0 = (((bits4.s0 & (0xF000)) >> 12) - 8) * scale.s0; // dequantize a row of the 16 weights
        dequantized_weights.s1 = (((bits4.s1 & (0xF000)) >> 12) - 8) * scale.s1;
        dequantized_weights.s2 = (((bits4.s2 & (0xF000)) >> 12) - 8) * scale.s2;
        dequantized_weights.s3 = (((bits4.s3 & (0xF000)) >> 12) - 8) * scale.s3;
        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
        c1 += B * dequantized_weights.s1;
        c2 += B * dequantized_weights.s2;
        c3 += B * dequantized_weights.s3;
    }

    int idx = (gy<<3)*m + (gx<<2); // vectorized store 16 elements

    // conditional check if store is to a valid location. Required when N is not a multiple of 8
    // if statements allow registers to be reused for each store
    // provides a performance boost due to reduced register footprint, which increases number of concurrent waves
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
    }
}
|
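For anyone skimming this removed kernel: the dequantization it vectorizes is easy to state in scalar form. Below is a minimal C sketch, assuming the layout the kernel's indexing implies (four 4-bit weights packed per ushort along K, nibbles offset by 8, one scale per 32 K-elements, column stride m). Function names are illustrative and float stands in for half; this is not ggml host code.

#include <stddef.h>
#include <stdint.h>

/* Extract nibble j (j=0 is bits 0..3, matching the kernel's j=0..3 masks)
 * and dequantize: (nibble - 8) * scale. */
static inline float dequant_4bit(uint16_t packed, int j, float scale) {
    int nibble = (packed >> (4 * j)) & 0xF;
    return (float)(nibble - 8) * scale;
}

/* Reference dot product of one weight column gx against activations
 * b[0..k-1], mirroring the kernel's (i/4) weight indexing and (i/32)
 * scale indexing with stride m between K groups. */
float ref_dot(const uint16_t *q, const float *d, const float *b,
              int m, int k, int gx) {
    float acc = 0.0f;
    for (int i = 0; i < k; i += 4) {
        uint16_t packed = q[(size_t)(i / 4) * m + gx];
        float    scale  = d[(size_t)(i / 32) * m + gx];
        for (int j = 0; j < 4; ++j) {
            acc += b[i + j] * dequant_4bit(packed, j, scale);
        }
    }
    return acc;
}

The guarded vstore4 block at the end of the kernel is the other notable trick: per-row bounds checks both handle N that is not a multiple of 8 and, per the kernel's own comments, let store registers be reused, raising the number of concurrent waves.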
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
DELETED
@@ -1,26 +0,0 @@
// 16-bit transpose, loading/storing a 4x4 tile of elements

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

kernel void kernel_transpose_16(
        __read_only image1d_buffer_t input,
        __write_only image1d_buffer_t output,
        const uint rows,
        const uint cols
) {

    const int i = get_global_id(0);
    const int j = get_global_id(1);
    const int i_2 = i<<2;
    const int j_2 = j<<2;

    half4 temp0 = read_imageh(input, (j_2+0)*cols+i);
    half4 temp1 = read_imageh(input, (j_2+1)*cols+i);
    half4 temp2 = read_imageh(input, (j_2+2)*cols+i);
    half4 temp3 = read_imageh(input, (j_2+3)*cols+i);

    write_imageh(output, (i_2+0)*rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
    write_imageh(output, (i_2+1)*rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
    write_imageh(output, (i_2+2)*rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
    write_imageh(output, (i_2+3)*rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}
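This kernel (and its 32-bit sibling below) is a textbook tiled transpose: four vector loads along rows, then four shuffled stores along columns. A plain C reference of the same access pattern, for illustration only — scalar floats instead of half4 texels, and assuming dimensions divisible by 4:

#include <assert.h>

/* Transpose an R x C matrix one 4x4 tile at a time, mirroring the kernel:
 * four row-reads into a register tile, four column-writes out of it. */
void transpose_tiled(const float *src, float *dst, int R, int C) {
    assert(R % 4 == 0 && C % 4 == 0);
    for (int j = 0; j < R; j += 4) {         /* tile row    (kernel's j_2) */
        for (int i = 0; i < C; i += 4) {     /* tile column                */
            float t[4][4];
            for (int r = 0; r < 4; ++r)      /* four vector loads          */
                for (int c = 0; c < 4; ++c)
                    t[r][c] = src[(j + r) * C + (i + c)];
            for (int c = 0; c < 4; ++c)      /* four shuffled stores       */
                for (int r = 0; r < 4; ++r)
                    dst[(i + c) * R + (j + r)] = t[r][c];
        }
    }
}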
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
DELETED
@@ -1,25 +0,0 @@
// 32-bit transpose, loading/storing a 4x4 tile of elements

kernel void kernel_transpose_32(
        __read_only image1d_buffer_t input,
        __write_only image1d_buffer_t output,
        const uint rows,
        const uint cols
) {

    const int i = get_global_id(0);
    const int j = get_global_id(1);
    const int i_2 = i<<2;
    const int j_2 = j<<2;

    float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
    float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
    float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
    float4 temp3 = read_imagef(input, (j_2+3)*cols+i);

    write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
    write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
    write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
    write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));

}
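Hosting either transpose kernel needs only standard OpenCL API calls. The sketch below is hypothetical, not taken from ggml's backend; in particular the global work size {cols, rows/4} is an assumption inferred from each work-item covering a 4-texel-tall tile.

#include <CL/cl.h>

/* Hypothetical host-side dispatch for kernel_transpose_32 (illustrative;
 * argument order follows the kernel signature above). */
static cl_int enqueue_transpose_32(cl_command_queue queue, cl_kernel kernel,
                                   cl_mem input, cl_mem output,
                                   cl_uint rows, cl_uint cols) {
    cl_int err = CL_SUCCESS;
    /* OR-ing error codes is a crude catch-all; check individually in practice. */
    err |= clSetKernelArg(kernel, 0, sizeof(cl_mem),  &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem),  &output);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_uint), &rows);
    err |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &cols);
    if (err != CL_SUCCESS) return err;

    /* Assumed geometry: one work-item per 4x4-element tile column. */
    size_t global[2] = { (size_t) cols, (size_t) rows / 4 };
    return clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL,
                                  0, NULL, NULL);
}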
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
DELETED
@@ -1,35 +0,0 @@
// 32-bit transpose, loading/storing a 4x4 tile of elements
// Only used for activations
// converts to FP16
// also adds zero padding for non multiple of 8 prompt lengths
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {

    const int i = get_global_id(0);
    const int j = get_global_id(1);
    const int i_2 = i<<2;
    const int j_2 = j<<2;
    half4 temp0 = {0,0,0,0}; // initialize outputs to 0
    half4 temp1 = {0,0,0,0};
    half4 temp2 = {0,0,0,0};
    half4 temp3 = {0,0,0,0};

    if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0
        temp0 = read_imageh(input, (j_2+0)*cols+i);
    }
    if((j_2+1)*cols+i*4+3 < rows*cols*16){
        temp1 = read_imageh(input, (j_2+1)*cols+i);
    }
    if((j_2+2)*cols+i*4+3 < rows*cols*16){
        temp2 = read_imageh(input, (j_2+2)*cols+i);
    }
    if((j_2+3)*cols+i*4+3 < rows*cols*16){
        temp3 = read_imageh(input, (j_2+3)*cols+i);
    }

    write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
    write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
    write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
    write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}
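Behavior-wise, this last kernel is "transpose, convert to FP16, and zero-pad the row count". A scalar C reference of the transpose-with-padding part, under assumed row-major layouts (the half conversion is elided and names are illustrative; not ggml host code):

#include <stddef.h>

/* Transpose an R x C matrix into a C x Rp buffer with Rp >= R,
 * zero-filling positions R..Rp-1 of each output row -- the same result
 * the kernel gets from guarded loads plus unconditional stores. */
void transpose_pad_ref(const float *src, float *dst, int R, int C, int Rp) {
    for (int c = 0; c < C; ++c) {
        for (int r = 0; r < Rp; ++r) {
            dst[(size_t)c * Rp + r] = (r < R) ? src[(size_t)r * C + c] : 0.0f;
        }
    }
}

Guarding the loads and writing every output slot, as the kernel does, produces the padded result without a separate buffer clear.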