ggerganov committed on
Commit adc6542 · 1 Parent(s): cac9245

opencl : remove obsolete files (skip) (ggml/1200)

ggml/src/ggml-opencl/kernels/ggml-opencl.cl DELETED
The diff for this file is too large to render. See raw diff
 
ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl DELETED
@@ -1,106 +0,0 @@
1
- //------------------------------------------------------------------------------
2
- // This file is contains additional kernels for data conversion.
3
- // These kernels are used when loading the model, so its performance is less
4
- // important.
5
- //------------------------------------------------------------------------------
6
- #ifdef cl_khr_fp16
7
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
8
- #elif defined(cl_amd_fp16)
9
- #pragma OPENCL EXTENSION cl_amd_fp16 : enable
10
- #else
11
- #error "Half precision floating point not supportedby OpenCL implementation on your device."
12
- #endif
13
-
14
- #ifdef cl_khr_subgroups
15
- #pragma OPENCL EXTENSION cl_khr_subgroups : enable
16
- #elif defined(cl_intel_subgroups)
17
- #pragma OPENCL EXTENSION cl_intel_subgroups : enable
18
- #else
19
- #error "Subgroup not supported on your device."
20
- #endif
21
-
22
- #ifdef cl_intel_required_subgroup_size
23
- // Always use subgroup size of 32 on Intel.
24
- #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
25
- #define INTEL_GPU 1
26
- #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
27
- #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
28
- #elif defined(cl_qcom_reqd_sub_group_size)
29
- // Always use subgroups size of 64 on Adreno.
30
- #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
31
- #define ADRENO_GPU 1
32
- #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
33
- #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
34
- #else
35
- // TODO: do not know how to choose subgroup size on other GPUs.
36
- #error "Selecting subgroup size is not supported on your device."
37
- #endif
38
-
39
- #define QK4_0 32
40
- #define QR4_0 2
41
- #define QK4_1 32
42
- #define QR4_1 2
43
- #define QK5_0 32
44
- #define QR5_0 2
45
- #define QK5_1 32
46
- #define QR5_1 2
47
- #define QK8_0 32
48
- #define QR8_0 1
49
- #define QK_K 256
50
- #define K_QUANTS_PER_ITERATION 2
51
-
52
- typedef char int8_t;
53
- typedef uchar uint8_t;
54
- typedef short int16_t;
55
- typedef ushort uint16_t;
56
- typedef int int32_t;
57
- typedef uint uint32_t;
58
-
59
- //------------------------------------------------------------------------------
60
- // block_q4_0
61
- //------------------------------------------------------------------------------
62
- struct block_q4_0
63
- {
64
- half d;
65
- uint8_t qs[QK4_0 / 2];
66
- };
67
-
68
- //------------------------------------------------------------------------------
69
- // mul_vec_q_n_f32_flat_noshuffle
70
- //
71
- // This variation uses flat arrays (struct of arrays, SOA) representation for
72
- // quant tensors. It also uses non shuffled bit order for weights.
73
- //
74
- // The shuffled version is kept in the original file because moving it here
75
- // seems to result in worse performance for adreno.
76
- //------------------------------------------------------------------------------
77
-
78
- kernel void kernel_convert_block_q4_0_noshuffle(
79
- global struct block_q4_0 * src0,
80
- global uchar * dst_q,
81
- global half * dst_d
82
- ) {
83
- global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
84
- global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
85
- global half * d = (global half *) dst_d + get_global_id(0);
86
-
87
- *d = b->d;
88
- for (int i = 0; i < QK4_0/4; ++i) {
89
- uchar x0 = b->qs[2*i + 0];
90
- uchar x1 = b->qs[2*i + 1];
91
-
92
- q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
93
- q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
94
-
95
- #ifdef ADRENO_GPU
96
- // Workaround for adreno - must have the following printf statement for
97
- // the kernel to work properly. Otherwise it produces incorrect result.
98
- // convert_uchar above also seems necessary.
99
- // Compare against a large number so that it does not print anything.
100
- // get_sub_group_local_id() also works.
101
- if (get_global_id(0) == 65536*4096) {
102
- printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
103
- }
104
- #endif
105
- }
106
- }
 
ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl DELETED
@@ -1,268 +0,0 @@
1
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
- #pragma OPENCL EXTENSION cl_khr_subgroups : enable
3
-
4
- #ifdef cl_qcom_reqd_sub_group_size
5
- #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
6
- #define ADRENO_GPU 1
7
- #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
8
- #endif
9
-
10
- // assume
11
- #define QK4_0 32
12
- #define N_SIMDGROUP 4
13
-
14
- #define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
15
- float shared_y; \
16
- shared_y = sub_group_broadcast(y.s0, 0); \
17
- total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
18
- total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
19
- shared_y = sub_group_broadcast(y.s1, 0); \
20
- total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
21
- total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
22
- shared_y = sub_group_broadcast(y.s2, 0); \
23
- total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
24
- total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
25
- shared_y = sub_group_broadcast(y.s3, 0); \
26
- total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
27
- total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
28
- shared_y = sub_group_broadcast(y.s4, 0); \
29
- total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
30
- total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
31
- shared_y = sub_group_broadcast(y.s5, 0); \
32
- total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
33
- total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
34
- shared_y = sub_group_broadcast(y.s6, 0); \
35
- total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
36
- total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
37
- shared_y = sub_group_broadcast(y.s7, 0); \
38
- total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
39
- total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
40
- shared_y = sub_group_broadcast(y.s0, 1); \
41
- total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
42
- total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
43
- shared_y = sub_group_broadcast(y.s1, 1); \
44
- total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
45
- total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
46
- shared_y = sub_group_broadcast(y.s2, 1); \
47
- total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
48
- total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
49
- shared_y = sub_group_broadcast(y.s3, 1); \
50
- total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
51
- total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
52
- shared_y = sub_group_broadcast(y.s4, 1); \
53
- total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
54
- total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
55
- shared_y = sub_group_broadcast(y.s5, 1); \
56
- total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
57
- total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
58
- shared_y = sub_group_broadcast(y.s6, 1); \
59
- total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
60
- total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
61
- shared_y = sub_group_broadcast(y.s7, 1); \
62
- total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
63
- total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
64
-
65
-
66
- #define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
67
- shared_y = sub_group_broadcast(y.s0, 2); \
68
- total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
69
- total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
70
- shared_y = sub_group_broadcast(y.s1, 2); \
71
- total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
72
- total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
73
- shared_y = sub_group_broadcast(y.s2, 2); \
74
- total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
75
- total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
76
- shared_y = sub_group_broadcast(y.s3, 2); \
77
- total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
78
- total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
79
- shared_y = sub_group_broadcast(y.s4, 2); \
80
- total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
81
- total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
82
- shared_y = sub_group_broadcast(y.s5, 2); \
83
- total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
84
- total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
85
- shared_y = sub_group_broadcast(y.s6, 2); \
86
- total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
87
- total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
88
- shared_y = sub_group_broadcast(y.s7, 2); \
89
- total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
90
- total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
91
- shared_y = sub_group_broadcast(y.s0, 3); \
92
- total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
93
- total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
94
- shared_y = sub_group_broadcast(y.s1, 3); \
95
- total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
96
- total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
97
- shared_y = sub_group_broadcast(y.s2, 3); \
98
- total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
99
- total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
100
- shared_y = sub_group_broadcast(y.s3, 3); \
101
- total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
102
- total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
103
- shared_y = sub_group_broadcast(y.s4, 3); \
104
- total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
105
- total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
106
- shared_y = sub_group_broadcast(y.s5, 3); \
107
- total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
108
- total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
109
- shared_y = sub_group_broadcast(y.s6, 3); \
110
- total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
111
- total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
112
- shared_y = sub_group_broadcast(y.s7, 3); \
113
- total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
114
- total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
115
-
116
-
117
- #define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
118
- float8 shared_y; \
119
- shared_y = sub_group_broadcast(y, 0); \
120
- total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
121
- total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
122
- total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
123
- total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
124
- total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
125
- total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
126
- total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
127
- total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
128
- total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
129
- total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
130
- total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
131
- total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
132
- total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
133
- total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
134
- total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
135
- total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
136
- shared_y = sub_group_broadcast(y, 1); \
137
- total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
138
- total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
139
- total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
140
- total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
141
- total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
142
- total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
143
- total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
144
- total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
145
- total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
146
- total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
147
- total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
148
- total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
149
- total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
150
- total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
151
- total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
152
- total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
153
-
154
-
155
- #define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
156
- shared_y = sub_group_broadcast(y, 2); \
157
- total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
158
- total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
159
- total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
160
- total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
161
- total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
162
- total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
163
- total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
164
- total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
165
- total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
166
- total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
167
- total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
168
- total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
169
- total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
170
- total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
171
- total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
172
- total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
173
- shared_y = sub_group_broadcast(y, 3); \
174
- total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
175
- total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
176
- total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
177
- total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
178
- total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
179
- total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
180
- total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
181
- total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
182
- total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
183
- total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
184
- total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
185
- total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
186
- total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
187
- total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
188
- total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
189
- total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
190
-
191
- #ifdef ADRENO_GPU
192
- REQD_SUBGROUP_SIZE_64
193
- #endif
194
- __kernel void kernel_gemv_noshuffle(
195
- __read_only image1d_buffer_t src0_q, // quantized A
196
- global half2 * src0_d, // A scales
197
- __read_only image1d_buffer_t src1, // B
198
- ulong offset1, // offset to B (0)
199
- global float * dst, // C
200
- ulong offsetd, // offset to C (0)
201
- uint K, // K
202
- int ne01, // M
203
- int ne02, // 1
204
- int ne10, // K
205
- int ne12, // 1
206
- int ne0, // M
207
- int ne1, // N
208
- int r2, // 1
209
- int r3)
210
- {
211
- uint groupId = get_local_id(1);
212
- uint gid = get_global_id(0);
213
- ushort slid = get_sub_group_local_id();
214
-
215
- __private uint4 regA;
216
- __private half2 regS;
217
- __private float8 regB;
218
-
219
- __private float2 totalSum = (float2)(0.0f);
220
-
221
- // loop along K in block granularity, skip 4 blocks every iter
222
- for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
223
- regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
224
- // first 4 fibers in each wave load 8 B values to its private scope
225
- if (slid < 4) {
226
- regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
227
- regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
228
- }
229
-
230
- // load half weights for two blocks in consecutive rows
231
- regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
232
- regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
233
- regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
234
- regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
235
- #ifdef VECTOR_SUB_GROUP_BROADCAT
236
- dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
237
- #else
238
- dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
239
- #endif // VECTOR_SUB_GROUP_BROADCAT
240
-
241
- regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
242
- regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
243
- regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
244
- regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
245
- #ifdef VECTOR_SUB_GROUP_BROADCAT
246
- dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
247
- #else
248
- dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
249
- #endif // VECTOR_SUB_GROUP_BROADCAT
250
- }
251
-
252
- // reduction in local memory, assumes #wave=4
253
- __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
254
- if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
255
- if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
256
- if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
257
- barrier(CLK_LOCAL_MEM_FENCE);
258
- if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
259
- if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
260
- if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
261
-
262
- // 2 outputs per fiber in wave 0
263
- if (groupId == 0) {
264
- dst = (global float*)((global char*)dst + offsetd);
265
- vstore2(totalSum, 0, &(dst[gid * 2]));
266
- }
267
-
268
- }
 
ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl DELETED
@@ -1,274 +0,0 @@
1
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
- #pragma OPENCL EXTENSION cl_khr_subgroups : enable
3
-
4
- #ifdef cl_qcom_reqd_sub_group_size
5
- #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
6
- #define ADRENO_GPU 1
7
- #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
8
- #endif
9
-
10
- // assume
11
- #define QK4_0 32
12
- #define N_SIMDGROUP 4
13
-
14
- #define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
15
- float shared_y; \
16
- shared_y = sub_group_broadcast(y.s0, 0); \
17
- total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
18
- total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
19
- shared_y = sub_group_broadcast(y.s1, 0); \
20
- total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
21
- total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
22
- shared_y = sub_group_broadcast(y.s2, 0); \
23
- total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
24
- total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
25
- shared_y = sub_group_broadcast(y.s3, 0); \
26
- total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
27
- total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
28
- shared_y = sub_group_broadcast(y.s4, 0); \
29
- total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
30
- total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
31
- shared_y = sub_group_broadcast(y.s5, 0); \
32
- total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
33
- total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
34
- shared_y = sub_group_broadcast(y.s6, 0); \
35
- total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
36
- total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
37
- shared_y = sub_group_broadcast(y.s7, 0); \
38
- total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
39
- total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
40
- shared_y = sub_group_broadcast(y.s0, 1); \
41
- total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
42
- total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
43
- shared_y = sub_group_broadcast(y.s1, 1); \
44
- total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
45
- total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
46
- shared_y = sub_group_broadcast(y.s2, 1); \
47
- total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
48
- total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
49
- shared_y = sub_group_broadcast(y.s3, 1); \
50
- total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
51
- total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
52
- shared_y = sub_group_broadcast(y.s4, 1); \
53
- total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
54
- total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
55
- shared_y = sub_group_broadcast(y.s5, 1); \
56
- total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
57
- total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
58
- shared_y = sub_group_broadcast(y.s6, 1); \
59
- total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
60
- total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
61
- shared_y = sub_group_broadcast(y.s7, 1); \
62
- total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
63
- total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
64
-
65
-
66
- #define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
67
- shared_y = sub_group_broadcast(y.s0, 2); \
68
- total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
69
- total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
70
- shared_y = sub_group_broadcast(y.s1, 2); \
71
- total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
72
- total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
73
- shared_y = sub_group_broadcast(y.s2, 2); \
74
- total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
75
- total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
76
- shared_y = sub_group_broadcast(y.s3, 2); \
77
- total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
78
- total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
79
- shared_y = sub_group_broadcast(y.s4, 2); \
80
- total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
81
- total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
82
- shared_y = sub_group_broadcast(y.s5, 2); \
83
- total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
84
- total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
85
- shared_y = sub_group_broadcast(y.s6, 2); \
86
- total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
87
- total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
88
- shared_y = sub_group_broadcast(y.s7, 2); \
89
- total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
90
- total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
91
- shared_y = sub_group_broadcast(y.s0, 3); \
92
- total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
93
- total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
94
- shared_y = sub_group_broadcast(y.s1, 3); \
95
- total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
96
- total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
97
- shared_y = sub_group_broadcast(y.s2, 3); \
98
- total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
99
- total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
100
- shared_y = sub_group_broadcast(y.s3, 3); \
101
- total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
102
- total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
103
- shared_y = sub_group_broadcast(y.s4, 3); \
104
- total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
105
- total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
106
- shared_y = sub_group_broadcast(y.s5, 3); \
107
- total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
108
- total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
109
- shared_y = sub_group_broadcast(y.s6, 3); \
110
- total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
111
- total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
112
- shared_y = sub_group_broadcast(y.s7, 3); \
113
- total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
114
- total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
115
-
116
-
117
- #define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
118
- float8 shared_y; \
119
- shared_y = sub_group_broadcast(y, 0); \
120
- total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
121
- total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
122
- total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
123
- total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
124
- total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
125
- total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
126
- total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
127
- total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
128
- total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
129
- total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
130
- total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
131
- total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
132
- total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
133
- total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
134
- total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
135
- total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
136
- shared_y = sub_group_broadcast(y, 1); \
137
- total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
138
- total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
139
- total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
140
- total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
141
- total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
142
- total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
143
- total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
144
- total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
145
- total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
146
- total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
147
- total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
148
- total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
149
- total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
150
- total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
151
- total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
152
- total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
153
-
154
-
155
- #define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
156
- shared_y = sub_group_broadcast(y, 2); \
157
- total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
158
- total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
159
- total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
160
- total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
161
- total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
162
- total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
163
- total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
164
- total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
165
- total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
166
- total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
167
- total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
168
- total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
169
- total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
170
- total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
171
- total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
172
- total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
173
- shared_y = sub_group_broadcast(y, 3); \
174
- total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
175
- total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
176
- total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
177
- total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
178
- total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
179
- total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
180
- total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
181
- total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
182
- total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
183
- total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
184
- total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
185
- total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
186
- total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
187
- total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
188
- total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
189
- total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
190
-
191
- #ifdef ADRENO_GPU
192
- REQD_SUBGROUP_SIZE_64
193
- #endif
194
- __kernel void kernel_gemv_noshuffle(
195
- __read_only image1d_buffer_t src0_q, // quantized A
196
- global half2 * src0_d, // A scales
197
- __read_only image1d_buffer_t src1, // B
198
- ulong offset1, // offset to B (0)
199
- global float * dst, // C
200
- ulong offsetd, // offset to C (0)
201
- int ne00, // K
202
- int ne01, // M
203
- int ne02, // 1
204
- int ne10, // K
205
- int ne12, // 1
206
- int ne0, // M
207
- int ne1, // N
208
- int r2, // 1
209
- int r3)
210
- {
211
- uint groupId = get_local_id(1);
212
- uint gid = get_global_id(0);
213
- ushort slid = get_sub_group_local_id();
214
-
215
- uint K = ne00;
216
- uint M = ne01;
217
-
218
- uint LINE_STRIDE_A = M / 2;
219
- uint BLOCK_STRIDE_A = N_SIMDGROUP * M;
220
-
221
- __private uint4 regA;
222
- __private half2 regS;
223
- __private float8 regB;
224
-
225
- __private float2 totalSum = (float2)(0.0f);
226
-
227
- // loop along K in block granularity, skip 4 blocks every iter
228
- for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
229
- regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
230
- // first 4 fibers in each wave load 8 B values to its private scope
231
- if (slid < 4) {
232
- regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
233
- regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
234
- }
235
-
236
- // load half weights for two blocks in consecutive rows
237
- regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
238
- regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
239
- regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
240
- regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
241
- #ifdef VECTOR_SUB_GROUP_BROADCAT
242
- dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
243
- #else
244
- dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
245
- #endif // VECTOR_SUB_GROUP_BROADCAT
246
-
247
- regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
248
- regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
249
- regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
250
- regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
251
- #ifdef VECTOR_SUB_GROUP_BROADCAT
252
- dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
253
- #else
254
- dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
255
- #endif // VECTOR_SUB_GROUP_BROADCAT
256
- }
257
-
258
- // reduction in local memory, assumes #wave=4
259
- __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
260
- if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
261
- if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
262
- if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
263
- barrier(CLK_LOCAL_MEM_FENCE);
264
- if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
265
- if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
266
- if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
267
-
268
- // 2 outputs per fiber in wave 0
269
- if (groupId == 0) {
270
- dst = (global float*)((global char*)dst + offsetd);
271
- vstore2(totalSum, 0, &(dst[gid * 2]));
272
- }
273
-
274
- }
 
ggml/src/ggml-opencl/kernels/ggml-opencl_im2col.cl DELETED
@@ -1,146 +0,0 @@
1
- #ifdef cl_khr_fp16
2
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
3
- #elif defined(cl_amd_fp16)
4
- #pragma OPENCL EXTENSION cl_amd_fp16 : enable
5
- #else
6
- #error "Half precision floating point not supportedby OpenCL implementation on your device."
7
- #endif
8
-
9
- #ifdef cl_khr_subgroups
10
- #pragma OPENCL EXTENSION cl_khr_subgroups : enable
11
- #elif defined(cl_intel_subgroups)
12
- #pragma OPENCL EXTENSION cl_intel_subgroups : enable
13
- #else
14
- #error "Subgroup not supported on your device."
15
- #endif
16
-
17
- #ifdef cl_intel_required_subgroup_size
18
- // Always use subgroup size of 32 on Intel.
19
- #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
20
- #define INTEL_GPU 1
21
- #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
22
- #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
23
- #elif defined(cl_qcom_reqd_sub_group_size)
24
- // Always use subgroups size of 64 on Adreno.
25
- #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
26
- #define ADRENO_GPU 1
27
- #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
28
- #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
29
- #else
30
- // TODO: do not know how to choose subgroup size on other GPUs.
31
- #error "Selecting subgroup size is not supported on your device."
32
- #endif
33
-
34
- kernel void kernel_im2col_f32(
35
- global float * src1,
36
- ulong offset1,
37
- global float * dst,
38
- ulong offsetd,
39
- ulong batch_offset,
40
- ulong delta_offset,
41
- long IW,
42
- long IH,
43
- long IC,
44
- long OW,
45
- long OH,
46
- long KW,
47
- long KH,
48
- long pelements,
49
- long CHW,
50
- int s0,
51
- int s1,
52
- int p0,
53
- int p1,
54
- int d0,
55
- int d1
56
- ) {
57
- // threadIdx.x + blockIdx.x * blockDim.x
58
- long i = get_global_id(0);
59
- if (i >= pelements) {
60
- return;
61
- }
62
-
63
- src1 = (global float*)((global char*)src1 + offset1);
64
- dst = (global float*)((global char*)dst + offsetd);
65
-
66
- long ksize = OW * (KH > 1 ? KW : 1);
67
- long kx = i / ksize;
68
- long kd = kx * ksize;
69
- long ky = (i - kd) / OW;
70
- long ix = i % OW;
71
-
72
- long oh = get_group_id(1);
73
- long batch = get_group_id(2) / IC;
74
- long ic = get_group_id(2) % IC;
75
-
76
- long iiw = ix * s0 + kx * d0 - p0;
77
- long iih = oh * s1 + ky * d1 - p1;
78
-
79
- long offset_dst =
80
- ((batch * OH + oh) * OW + ix) * CHW +
81
- (ic * (KW * KH) + ky * KW + kx);
82
-
83
- if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
84
- dst[offset_dst] = 0.0f;
85
- } else {
86
- long offset_src = ic * delta_offset + batch * batch_offset;
87
- dst[offset_dst] = src1[offset_src + iih * IW + iiw];
88
- }
89
- }
90
-
91
- kernel void kernel_im2col_f16(
92
- global float * src1,
93
- ulong offset1,
94
- global half * dst,
95
- ulong offsetd,
96
- ulong batch_offset,
97
- ulong delta_offset,
98
- long IW,
99
- long IH,
100
- long IC,
101
- long OW,
102
- long OH,
103
- long KW,
104
- long KH,
105
- long pelements,
106
- long CHW,
107
- int s0,
108
- int s1,
109
- int p0,
110
- int p1,
111
- int d0,
112
- int d1
113
- ) {
114
- long i = get_global_id(0);
115
-
116
- if (i >= pelements) {
117
- return;
118
- }
119
-
120
- src1 = (global float*)((global char*)src1 + offset1);
121
- dst = (global half*)((global char*)dst + offsetd);
122
-
123
- long ksize = OW * (KH > 1 ? KW : 1);
124
- long kx = i / ksize;
125
- long kd = kx * ksize;
126
- long ky = (i - kd) / OW;
127
- long ix = i % OW;
128
-
129
- long oh = get_group_id(1);
130
- long batch = get_group_id(2) / IC;
131
- long ic = get_group_id(2) % IC;
132
-
133
- long iiw = ix * s0 + kx * d0 - p0;
134
- long iih = oh * s1 + ky * d1 - p1;
135
-
136
- long offset_dst =
137
- ((batch * OH + oh) * OW + ix) * CHW +
138
- (ic * (KW * KH) + ky * KW + kx);
139
-
140
- if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
141
- dst[offset_dst] = 0.0f;
142
- } else {
143
- long offset_src = ic * delta_offset + batch * batch_offset;
144
- dst[offset_dst] = src1[offset_src + iih * IW + iiw];
145
- }
146
- }
 
ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl DELETED
@@ -1,1225 +0,0 @@
1
- //------------------------------------------------------------------------------
2
- // This file is contains additional mulmat kernels
3
- // (and potentially other kernels).
4
- //------------------------------------------------------------------------------
5
- #ifdef cl_khr_fp16
6
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
7
- #elif defined(cl_amd_fp16)
8
- #pragma OPENCL EXTENSION cl_amd_fp16 : enable
9
- #else
10
- #error "Half precision floating point not supportedby OpenCL implementation on your device."
11
- #endif
12
-
13
- #ifdef cl_khr_subgroups
14
- #pragma OPENCL EXTENSION cl_khr_subgroups : enable
15
- #elif defined(cl_intel_subgroups)
16
- #pragma OPENCL EXTENSION cl_intel_subgroups : enable
17
- #else
18
- #error "Subgroup not supported on your device."
19
- #endif
20
-
21
- #ifdef cl_intel_required_subgroup_size
22
- // Always use subgroup size of 32 on Intel.
23
- #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
24
- #define INTEL_GPU 1
25
- #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
26
- #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
27
- #elif defined(cl_qcom_reqd_sub_group_size)
28
- // Always use subgroups size of 64 on Adreno.
29
- #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
30
- #define ADRENO_GPU 1
31
- #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
32
- #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
33
- #else
34
- // TODO: do not know how to choose subgroup size on other GPUs.
35
- #error "Selecting subgroup size is not supported on your device."
36
- #endif
37
-
38
- #define QK4_0 32
39
- #define QR4_0 2
40
- #define QK4_1 32
41
- #define QR4_1 2
42
- #define QK5_0 32
43
- #define QR5_0 2
44
- #define QK5_1 32
45
- #define QR5_1 2
46
- #define QK8_0 32
47
- #define QR8_0 1
48
- #define QK_K 256
49
- #define K_QUANTS_PER_ITERATION 2
50
-
51
- typedef char int8_t;
52
- typedef uchar uint8_t;
53
- typedef short int16_t;
54
- typedef ushort uint16_t;
55
- typedef int int32_t;
56
- typedef uint uint32_t;
57
-
58
- //------------------------------------------------------------------------------
59
- // block_q4_0
60
- //------------------------------------------------------------------------------
61
- struct block_q4_0
62
- {
63
- half d;
64
- uint8_t qs[QK4_0 / 2];
65
- };
66
-
67
- //------------------------------------------------------------------------------
68
- // block_q6_K
69
- //------------------------------------------------------------------------------
70
- // 6-bit quantization
71
- // weight is represented as x = a * q
72
- // 16 blocks of 16 elements each
73
- // Effectively 6.5625 bits per weight
74
- typedef struct {
75
- uint8_t ql[QK_K/2]; // quants, lower 4 bits
76
- uint8_t qh[QK_K/4]; // quants, upper 2 bits
77
- int8_t scales[QK_K/16]; // scales, quantized with 8 bits
78
- half d; // super-block scale
79
- } block_q6_K;
80
-
81
- //------------------------------------------------------------------------------
82
- // These are the variant for matmatmul, based on the matvecmul kernel with
83
- // flattened block_q4_0.
84
- //------------------------------------------------------------------------------
85
-
86
- // Common dot prod.
87
- inline float mm_block_q_4_0_dot_y_flat(
88
- global uchar * x,
89
- global half * dh,
90
- float sumy,
91
- float16 yl,
92
- int il
93
- ) {
94
- float d = *dh;
95
- global ushort * qs = ((global ushort *)x + il/2);
96
- float acc = 0.f;
97
-
98
- acc += yl.s0 * (qs[0] & 0x000F);
99
- acc += yl.s1 * (qs[0] & 0x0F00);
100
- acc += yl.s8 * (qs[0] & 0x00F0);
101
- acc += yl.s9 * (qs[0] & 0xF000);
102
-
103
- acc += yl.s2 * (qs[1] & 0x000F);
104
- acc += yl.s3 * (qs[1] & 0x0F00);
105
- acc += yl.sa * (qs[1] & 0x00F0);
106
- acc += yl.sb * (qs[1] & 0xF000);
107
-
108
- acc += yl.s4 * (qs[2] & 0x000F);
109
- acc += yl.s5 * (qs[2] & 0x0F00);
110
- acc += yl.sc * (qs[2] & 0x00F0);
111
- acc += yl.sd * (qs[2] & 0xF000);
112
-
113
- acc += yl.s6 * (qs[3] & 0x000F);
114
- acc += yl.s7 * (qs[3] & 0x0F00);
115
- acc += yl.se * (qs[3] & 0x00F0);
116
- acc += yl.sf * (qs[3] & 0xF000);
117
-
118
- return d * (sumy * -8.f + acc);
119
- }
120
-
121
- #undef N_DST
122
- #undef N_SIMDGROUP
123
- #undef N_SIMDWIDTH
124
-
125
- #ifdef INTEL_GPU
126
- #define N_DST 8 // each SIMD group works on 8 rows (in weights matrix)
127
- #define N_SIMDGROUP 1 // number of SIMD groups in a thread group
128
- #define N_SIMDWIDTH 16 // assuming SIMD group size is 16
129
- #elif defined (ADRENO_GPU)
130
- #define N_DST 8
131
- #define N_SIMDGROUP 1
132
- #define N_SIMDWIDTH 64
133
- #endif
134
- //
135
- // This variant performs 1d blocking with 8x output.
136
- // Eeach simdgroup outputs 8 values on `n0` dim (row in the output matrix).
137
- //
138
- inline void mul_mat_q_n_f32_1d_8x_flat(
139
- global uchar * src0_q,
140
- global half * src0_d,
141
- global float * src1,
142
- global float * dst,
143
- int ne00,
144
- int ne01,
145
- int ne02,
146
- int ne10,
147
- int ne12,
148
- int ne0,
149
- int ne1,
150
- int r2,
151
- int r3
152
- ) {
153
- const int nb = ne00/QK4_0;
154
-
155
- int r0 = get_group_id(0);
156
- int r1 = get_group_id(1);
157
- int im = get_group_id(2);
158
-
159
- // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
160
- // a SIMD group in the grid. Each SIMD group produces N_DST values in the
161
- // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
162
- // Currently with llama2 7B, im is always 0.
163
- // TODO: how to handle im/gqa*(nb*ne0)?
164
- int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
165
-
166
- int i12 = im%ne12;
167
- int i13 = im/ne12;
168
-
169
- // The number of scales is the same as the number of blocks.
170
- ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
171
- // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
172
- ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
173
-
174
- global uchar * x = (global uchar *) src0_q + offset0_q;
175
- global half * d = (global half *) src0_d + offset0_d;
176
- global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
177
-
178
- float16 yl;
179
- float8 sumf = (float8)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
180
-
181
- int ix = get_sub_group_local_id()/2;
182
- int il = 8*(get_sub_group_local_id()%2);
183
-
184
- global float * yb = y + ix*QK4_0 + il;
185
-
186
- for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
187
- float sumy = 0.f;
188
-
189
- sumy += yb[0];
190
- sumy += yb[1];
191
- sumy += yb[2];
192
- sumy += yb[3];
193
- sumy += yb[4];
194
- sumy += yb[5];
195
- sumy += yb[6];
196
- sumy += yb[7];
197
-
198
- sumy += yb[16];
199
- sumy += yb[17];
200
- sumy += yb[18];
201
- sumy += yb[19];
202
- sumy += yb[20];
203
- sumy += yb[21];
204
- sumy += yb[22];
205
- sumy += yb[23];
206
-
207
- yl.s0 = yb[0];
208
- yl.s1 = yb[1]/256.f;
209
-
210
- yl.s2 = yb[2];
211
- yl.s3 = yb[3]/256.f;
212
-
213
- yl.s4 = yb[4];
214
- yl.s5 = yb[5]/256.f;
215
-
216
- yl.s6 = yb[6];
217
- yl.s7 = yb[7]/256.f;
218
-
219
- yl.s8 = yb[16]/16.f;
220
- yl.s9 = yb[17]/4096.f;
221
-
222
- yl.sa = yb[18]/16.f;
223
- yl.sb = yb[19]/4096.f;
224
-
225
- yl.sc = yb[20]/16.f;
226
- yl.sd = yb[21]/4096.f;
227
-
228
- yl.se = yb[22]/16.f;
229
- yl.sf = yb[23]/4096.f;
230
-
231
- sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
232
- sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
233
- sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
234
- sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
235
-
236
- sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
237
- sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
238
- sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
239
- sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
240
-
241
- yb += QK4_0 * (N_SIMDWIDTH/2);
242
- }
243
-
244
- float8 tot = (float8)(
245
- sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
246
- sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
247
- sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
248
- sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
249
- );
250
-
251
- if (get_sub_group_local_id() == 0) {
252
- if (first_row + 0 < ne01) {
253
- dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
254
- }
255
- if (first_row + 1 < ne01) {
256
- dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
257
- }
258
- if (first_row + 2 < ne01) {
259
- dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
260
- }
261
- if (first_row + 3 < ne01) {
262
- dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
263
- }
264
-
265
- if (first_row + 4 < ne01) {
266
- dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
267
- }
268
- if (first_row + 5 < ne01) {
269
- dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
270
- }
271
- if (first_row + 6 < ne01) {
272
- dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
273
- }
274
- if (first_row + 7 < ne01) {
275
- dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
276
- }
277
- }
278
- }
279
-
280
- #ifdef INTEL_GPU
281
- REQD_SUBGROUP_SIZE_16
282
- #elif defined (ADRENO_GPU)
283
- REQD_SUBGROUP_SIZE_64
284
- #endif
285
- kernel void kernel_mul_mat_q4_0_f32_1d_8x_flat(
286
- global uchar * src0_q,
287
- global half * src0_d,
288
- global float * src1,
289
- ulong offset1,
290
- global float * dst,
291
- ulong offsetd,
292
- int ne00,
293
- int ne01,
294
- int ne02,
295
- int ne10,
296
- int ne12,
297
- int ne0,
298
- int ne1,
299
- int r2,
300
- int r3
301
- ) {
302
- src1 = (global float*)((global char*)src1 + offset1);
303
- dst = (global float*)((global char*)dst + offsetd);
304
-
305
- mul_mat_q_n_f32_1d_8x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
306
- }
307
-
308
- #undef N_DST
309
- #undef N_SIMDGROUP
310
- #undef N_SIMDWIDTH
311
-
312
- #ifdef INTEL_GPU
313
- #define N_DST 16 // each SIMD group works on 8 rows (in weights matrix)
314
- #define N_SIMDGROUP 1 // number of SIMD groups in a thread group
315
- #define N_SIMDWIDTH 16 // assuming SIMD group size is 16
316
- #elif defined (ADRENO_GPU)
317
- #define N_DST 16
318
- #define N_SIMDGROUP 1
319
- #define N_SIMDWIDTH 64
320
- #endif
321
- //
322
- // This variant performs 1d blocking with 16x output.
323
- // Eeach simdgroup outputs 16 values on `n0` dim (row in the output matrix).
324
- //
325
- inline void mul_mat_q_n_f32_1d_16x_flat(
326
- global uchar * src0_q,
327
- global half * src0_d,
328
- global float * src1,
329
- global float * dst,
330
- int ne00,
331
- int ne01,
332
- int ne02,
333
- int ne10,
334
- int ne12,
335
- int ne0,
336
- int ne1,
337
- int r2,
338
- int r3
339
- ) {
340
- const int nb = ne00/QK4_0;
341
-
342
- int r0 = get_group_id(0);
343
- int r1 = get_group_id(1);
344
- int im = get_group_id(2);
345
-
346
- // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
347
- // a SIMD group in the grid. Each SIMD group produces N_DST values in the
348
- // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
349
- // Currently with llama2 7B, im is always 0.
350
- // TODO: how to handle im/gqa*(nb*ne0)?
351
- int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
352
-
353
- int i12 = im%ne12;
354
- int i13 = im/ne12;
355
-
356
- // The number of scales is the same as the number of blocks.
357
- ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
358
- // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
359
- ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
360
-
361
- global uchar * x = (global uchar *) src0_q + offset0_q;
362
- global half * d = (global half *) src0_d + offset0_d;
363
- global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
364
-
365
- float16 yl;
366
- float16 sumf = (float16)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
367
- 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
368
-
369
- int ix = get_sub_group_local_id()/2;
370
- int il = 8*(get_sub_group_local_id()%2);
371
-
372
- global float * yb = y + ix*QK4_0 + il;
373
-
374
- for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
375
- float sumy = 0.f;
376
-
377
- sumy += yb[0];
378
- sumy += yb[1];
379
- sumy += yb[2];
380
- sumy += yb[3];
381
- sumy += yb[4];
382
- sumy += yb[5];
383
- sumy += yb[6];
384
- sumy += yb[7];
385
-
386
- sumy += yb[16];
387
- sumy += yb[17];
388
- sumy += yb[18];
389
- sumy += yb[19];
390
- sumy += yb[20];
391
- sumy += yb[21];
392
- sumy += yb[22];
393
- sumy += yb[23];
394
-
395
- yl.s0 = yb[0];
396
- yl.s1 = yb[1]/256.f;
397
-
398
- yl.s2 = yb[2];
399
- yl.s3 = yb[3]/256.f;
400
-
401
- yl.s4 = yb[4];
402
- yl.s5 = yb[5]/256.f;
403
-
404
- yl.s6 = yb[6];
405
- yl.s7 = yb[7]/256.f;
406
-
407
- yl.s8 = yb[16]/16.f;
408
- yl.s9 = yb[17]/4096.f;
409
-
410
- yl.sa = yb[18]/16.f;
411
- yl.sb = yb[19]/4096.f;
412
-
413
- yl.sc = yb[20]/16.f;
414
- yl.sd = yb[21]/4096.f;
415
-
416
- yl.se = yb[22]/16.f;
417
- yl.sf = yb[23]/4096.f;
418
-
419
- sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
420
- sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
421
- sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
422
- sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
423
-
424
- sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
425
- sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
426
- sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
427
- sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
428
-
429
- sumf.s8 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 8*nb*QK4_0/2, d + ib + 8*nb, sumy, yl, il);
430
- sumf.s9 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 9*nb*QK4_0/2, d + ib + 9*nb, sumy, yl, il);
431
- sumf.sa += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 10*nb*QK4_0/2, d + ib + 10*nb, sumy, yl, il);
432
- sumf.sb += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 11*nb*QK4_0/2, d + ib + 11*nb, sumy, yl, il);
433
-
434
- sumf.sc += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 12*nb*QK4_0/2, d + ib + 12*nb, sumy, yl, il);
435
- sumf.sd += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 13*nb*QK4_0/2, d + ib + 13*nb, sumy, yl, il);
436
- sumf.se += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 14*nb*QK4_0/2, d + ib + 14*nb, sumy, yl, il);
437
- sumf.sf += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 15*nb*QK4_0/2, d + ib + 15*nb, sumy, yl, il);
438
-
439
- yb += QK4_0 * (N_SIMDWIDTH/2);
440
- }
441
-
442
- float16 tot = (float16)(
443
- sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
444
- sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
445
- sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
446
- sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7),
447
-
448
- sub_group_reduce_add(sumf.s8), sub_group_reduce_add(sumf.s9),
449
- sub_group_reduce_add(sumf.sa), sub_group_reduce_add(sumf.sb),
450
- sub_group_reduce_add(sumf.sc), sub_group_reduce_add(sumf.sd),
451
- sub_group_reduce_add(sumf.se), sub_group_reduce_add(sumf.sf)
452
- );
453
-
454
- if (get_sub_group_local_id() == 0) {
455
- if (first_row + 0 < ne01) {
456
- dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
457
- }
458
- if (first_row + 1 < ne01) {
459
- dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
460
- }
461
- if (first_row + 2 < ne01) {
462
- dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
463
- }
464
- if (first_row + 3 < ne01) {
465
- dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
466
- }
467
-
468
- if (first_row + 4 < ne01) {
469
- dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
470
- }
471
- if (first_row + 5 < ne01) {
472
- dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
473
- }
474
- if (first_row + 6 < ne01) {
475
- dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
476
- }
477
- if (first_row + 7 < ne01) {
478
- dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
479
- }
480
-
481
- if (first_row + 8 < ne01) {
482
- dst[r1*ne0 + im*ne0*ne1 + first_row + 8] = tot.s8;
483
- }
484
- if (first_row + 9 < ne01) {
485
- dst[r1*ne0 + im*ne0*ne1 + first_row + 9] = tot.s9;
486
- }
487
- if (first_row + 10 < ne01) {
488
- dst[r1*ne0 + im*ne0*ne1 + first_row + 10] = tot.sa;
489
- }
490
- if (first_row + 11 < ne01) {
491
- dst[r1*ne0 + im*ne0*ne1 + first_row + 11] = tot.sb;
492
- }
493
-
494
- if (first_row + 12 < ne01) {
495
- dst[r1*ne0 + im*ne0*ne1 + first_row + 12] = tot.sc;
496
- }
497
- if (first_row + 13 < ne01) {
498
- dst[r1*ne0 + im*ne0*ne1 + first_row + 13] = tot.sd;
499
- }
500
- if (first_row + 14 < ne01) {
501
- dst[r1*ne0 + im*ne0*ne1 + first_row + 14] = tot.se;
502
- }
503
- if (first_row + 15 < ne01) {
504
- dst[r1*ne0 + im*ne0*ne1 + first_row + 15] = tot.sf;
505
- }
506
- }
507
- }
508
-
509
- #ifdef INTEL_GPU
510
- REQD_SUBGROUP_SIZE_16
511
- #elif defined (ADRENO_GPU)
512
- REQD_SUBGROUP_SIZE_64
513
- #endif
514
- kernel void kernel_mul_mat_q4_0_f32_1d_16x_flat(
515
- global uchar * src0_q,
516
- global half * src0_d,
517
- global float * src1,
518
- ulong offset1,
519
- global float * dst,
520
- ulong offsetd,
521
- int ne00,
522
- int ne01,
523
- int ne02,
524
- int ne10,
525
- int ne12,
526
- int ne0,
527
- int ne1,
528
- int r2,
529
- int r3
530
- ) {
531
- src1 = (global float*)((global char*)src1 + offset1);
532
- dst = (global float*)((global char*)dst + offsetd);
533
-
534
- mul_mat_q_n_f32_1d_16x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
535
- }
536
-
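For orientation, here is a minimal host-side C sketch (hypothetical, not taken from these kernels) of how the flat, structure-of-arrays Q4_0 layout used by the *_flat kernels above is indexed: all per-block scales are contiguous in src0_d and all packed nibbles are contiguous in src0_q, so a (row, block) pair maps to the offsets below. The batch terms involving i12/i13 are omitted for brevity.

```c
// Hypothetical sketch of the flat (SoA) Q4_0 indexing, ignoring batch offsets.
#include <stddef.h>
#include <stdio.h>

#define QK4_0 32

// nb = ne00 / QK4_0 blocks per row, as in the kernels above.
static size_t q4_0_flat_scale_index(size_t row, size_t block, size_t nb) {
    return row * nb + block;                 // one half-precision scale per block
}

static size_t q4_0_flat_quant_offset(size_t row, size_t block, size_t nb) {
    return (row * nb + block) * (QK4_0 / 2); // 16 packed bytes (32 nibbles) per block
}

int main(void) {
    size_t nb = 4096 / QK4_0; // e.g. a row of 4096 weights
    printf("scale index:  %zu\n", q4_0_flat_scale_index(2, 3, nb));
    printf("quant offset: %zu\n", q4_0_flat_quant_offset(2, 3, nb));
    return 0;
}
```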
537
- //------------------------------------------------------------------------------
538
- // kernel_mul_mat_q4_0_f32_flat_v0
539
- //------------------------------------------------------------------------------
540
- inline float block_q_4_0_dot_y_flat_v2(
541
- half x,
542
- half d,
543
- float sumy,
544
- float4 yl
545
- ) {
546
- uchar2 q = as_uchar2(x);
547
- float acc = 0.0f;
548
-
549
- acc += (q.s0 & 0x0F) * yl.s0;
550
- acc += (q.s1 & 0x0F) * yl.s1;
551
-
552
- acc += (q.s0 & 0xF0) * yl.s2;
553
- acc += (q.s1 & 0xF0) * yl.s3;
554
-
555
- return d * (sumy * -8.f + acc);
556
- }
557
-
558
- inline float block_q_4_0_dot_y_flat_v4(
559
- float x,
560
- half d,
561
- float sumy,
562
- float8 yl
563
- ) {
564
- uchar4 q = as_uchar4(x);
565
- float acc = 0.0f;
566
-
567
- acc += (q.s0 & 0x0F) * yl.s0;
568
- acc += (q.s1 & 0x0F) * yl.s1;
569
- acc += (q.s2 & 0x0F) * yl.s2;
570
- acc += (q.s3 & 0x0F) * yl.s3;
571
-
572
- acc += (q.s0 & 0xF0) * yl.s4;
573
- acc += (q.s1 & 0xF0) * yl.s5;
574
- acc += (q.s2 & 0xF0) * yl.s6;
575
- acc += (q.s3 & 0xF0) * yl.s7;
576
-
577
- return d * (sumy * -8.f + acc);
578
- }
579
-
580
- inline float block_q_4_0_dot_y_flat_v8(
581
- float2 x,
582
- half d,
583
- float sumy,
584
- float16 yl
585
- ) {
586
- uchar8 q = as_uchar8(x);
587
- float acc = 0.0f;
588
-
589
- acc += (q.s0 & 0x0F) * yl.s0;
590
- acc += (q.s1 & 0x0F) * yl.s1;
591
- acc += (q.s2 & 0x0F) * yl.s2;
592
- acc += (q.s3 & 0x0F) * yl.s3;
593
- acc += (q.s4 & 0x0F) * yl.s4;
594
- acc += (q.s5 & 0x0F) * yl.s5;
595
- acc += (q.s6 & 0x0F) * yl.s6;
596
- acc += (q.s7 & 0x0F) * yl.s7;
597
-
598
- acc += (q.s0 & 0xF0) * yl.s8;
599
- acc += (q.s1 & 0xF0) * yl.s9;
600
- acc += (q.s2 & 0xF0) * yl.sa;
601
- acc += (q.s3 & 0xF0) * yl.sb;
602
- acc += (q.s4 & 0xF0) * yl.sc;
603
- acc += (q.s5 & 0xF0) * yl.sd;
604
- acc += (q.s6 & 0xF0) * yl.se;
605
- acc += (q.s7 & 0xF0) * yl.sf;
606
-
607
- return d * (sumy * -8.f + acc);
608
- }
609
-
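The three helpers above avoid shifting the high nibbles: q & 0xF0 equals the high nibble times 16, so the matching activations are pre-divided by 16 (or by 256/4096 when two bytes are read as one word), and the constant -8 offset of Q4_0 is folded into sumy * -8.f. A standalone C sketch of this trick (hypothetical names, mirroring block_q_4_0_dot_y_flat_v4 but with its own generic activation ordering):

```c
#include <stdint.h>
#include <stdio.h>

// Plain dot product of 8 Q4_0 weights (packed in 4 bytes, low nibbles first,
// high nibbles second) with 8 activations; the true weight is (nibble - 8) * d.
static float q4_0_dot8_reference(const uint8_t q[4], float d, const float y[8]) {
    float acc = 0.0f;
    for (int i = 0; i < 4; ++i) {
        acc += (float)((q[i] & 0x0F) - 8) * y[i];     // low nibbles
        acc += (float)((q[i] >> 4)   - 8) * y[i + 4]; // high nibbles
    }
    return d * acc;
}

// Same result using the kernels' trick: keep the high nibble in place
// (q & 0xF0 == (q >> 4) * 16) and scale the activation by 1/16 instead,
// folding the -8 offset into sumy * -8.f.
static float q4_0_dot8_trick(const uint8_t q[4], float d, const float y[8]) {
    float sumy = 0.0f;
    for (int i = 0; i < 8; ++i) {
        sumy += y[i];
    }
    float acc = 0.0f;
    for (int i = 0; i < 4; ++i) {
        acc += (float)(q[i] & 0x0F) * y[i];
        acc += (float)(q[i] & 0xF0) * (y[i + 4] / 16.0f);
    }
    return d * (sumy * -8.0f + acc);
}

int main(void) {
    const uint8_t q[4] = {0x3A, 0x7F, 0x01, 0xC5};
    const float   y[8] = {0.5f, -1.0f, 2.0f, 0.25f, 1.5f, -0.75f, 0.0f, 3.0f};
    printf("reference: %f\n", q4_0_dot8_reference(q, 0.1f, y));
    printf("trick:     %f\n", q4_0_dot8_trick(q, 0.1f, y));
    return 0;
}
```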
610
- #undef N_DST
611
- #undef N_SIMDGROUP
612
- #undef N_SIMDWIDTH
613
-
614
- #ifdef INTEL_GPU
615
- #define THREADS_PER_BLK 4 // Number of threads per block; each thread processes 1/THREADS_PER_BLK of a block
616
- #define N_DST 4
617
- #define N_SIMDGROUP 1
618
- #define N_SIMDWIDTH 16
619
- #elif defined (ADRENO_GPU)
620
- #define THREADS_PER_BLK 4
621
- #define N_DST 4
622
- #define N_SIMDGROUP 1
623
- #define N_SIMDWIDTH 64
624
- #endif
625
-
626
- #if THREADS_PER_BLK == 2 // Each thread processes 1/2 block
627
- # define ACT_TY float16
628
- # define Q_BLK_LD_TY float2
629
- # define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v8
630
- #elif THREADS_PER_BLK == 4 // Each thread processes 1/4 block
631
- # define ACT_TY float8
632
- # define Q_BLK_LD_TY float
633
- # define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v4
634
- #elif THREADS_PER_BLK == 8 // Each thread processes 1/8 block
635
- # define ACT_TY float4
636
- # define Q_BLK_LD_TY half
637
- # define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v2
638
- #endif
639
-
640
- #define BTYES_PER_THREAD_IN_BLK (QK4_0/2/THREADS_PER_BLK)
641
-
642
- #if N_DST == 2
643
- # define SUM_TY float2
644
- #elif N_DST == 4
645
- # define SUM_TY float4
646
- #elif N_DST == 8
647
- # define SUM_TY float8
648
- #elif N_DST == 16
649
- # define SUM_TY float16
650
- #endif
651
-
652
- #ifdef INTEL_GPU
653
- REQD_SUBGROUP_SIZE_16
654
- #elif defined (ADRENO_GPU)
655
- REQD_SUBGROUP_SIZE_64
656
- #endif
657
- kernel void kernel_mul_mat_q4_0_f32_flat_v0(
658
- global uchar * src0_q,
659
- global half * src0_d,
660
- global float * src1,
661
- ulong offset1,
662
- global float * dst,
663
- ulong offsetd,
664
- int ne00,
665
- int ne01,
666
- int ne02,
667
- int ne10,
668
- int ne12,
669
- int ne0,
670
- int ne1,
671
- int r2,
672
- int r3
673
- ) {
674
- src1 = (global float*)((global char*)src1 + offset1);
675
- dst = (global float*)((global char*)dst + offsetd);
676
-
677
- const int nb = ne00/QK4_0;
678
-
679
- int r0 = get_group_id(0);
680
- int r1 = get_group_id(1);
681
- int im = get_group_id(2);
682
-
683
- int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
684
-
685
- int i12 = im%ne12;
686
- int i13 = im/ne12;
687
-
688
- // The number of scales is the same as the number of blocks.
689
- ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
690
- // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
691
- ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
692
-
693
- global uchar * x = (global uchar *) src0_q + offset0_q;
694
- global half * d = (global half *) src0_d + offset0_d;
695
- global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
696
-
697
- int ix = get_sub_group_local_id()/THREADS_PER_BLK;
698
- int il = get_sub_group_local_id()%THREADS_PER_BLK;
699
-
700
- global float * yb = y + ix*QK4_0 + BTYES_PER_THREAD_IN_BLK*il;
701
-
702
- // Registers for caching activation
703
- ACT_TY yl = 0.f;
704
-
705
- // Registers for caching quants
706
- Q_BLK_LD_TY q_blk_0 = 0, q_blk_1 = 0;
707
- #if N_DST == 4 || N_DST == 8 || N_DST == 16
708
- Q_BLK_LD_TY q_blk_2 = 0, q_blk_3 = 0;
709
- #endif
710
- #if N_DST == 8 || N_DST == 16
711
- Q_BLK_LD_TY q_blk_4 = 0, q_blk_5 = 0, q_blk_6 = 0, q_blk_7 = 0;
712
- #endif
713
-
714
- // Partial sum
715
- SUM_TY sumf = 0.f;
716
-
717
- for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/THREADS_PER_BLK) {
718
- float sumy = 0.f;
719
-
720
- q_blk_0 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 0*nb*QK4_0/2);
721
- q_blk_1 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 1*nb*QK4_0/2);
722
- #if N_DST == 4 || N_DST == 8 || N_DST == 16
723
- q_blk_2 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 2*nb*QK4_0/2);
724
- q_blk_3 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 3*nb*QK4_0/2);
725
- #endif
726
- #if N_DST == 8 || N_DST == 16
727
- q_blk_4 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 4*nb*QK4_0/2));
728
- q_blk_5 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 5*nb*QK4_0/2));
729
- q_blk_6 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 6*nb*QK4_0/2));
730
- q_blk_7 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 7*nb*QK4_0/2));
731
- #endif
732
-
733
- // Load activation
734
- #if THREADS_PER_BLK == 2 // Each thread processes 1/2 block
735
- yl.s01234567 = *(global float8 *)(yb);
736
- yl.s89abcdef = *(global float8 *)(yb + 16);
737
-
738
- sumy += yl.s0;
739
- sumy += yl.s1;
740
- sumy += yl.s2;
741
- sumy += yl.s3;
742
- sumy += yl.s4;
743
- sumy += yl.s5;
744
- sumy += yl.s6;
745
- sumy += yl.s7;
746
- sumy += yl.s8; yl.s8 /= 16.f;
747
- sumy += yl.s9; yl.s9 /= 16.f;
748
- sumy += yl.sa; yl.sa /= 16.f;
749
- sumy += yl.sb; yl.sb /= 16.f;
750
- sumy += yl.sc; yl.sc /= 16.f;
751
- sumy += yl.sd; yl.sd /= 16.f;
752
- sumy += yl.se; yl.se /= 16.f;
753
- sumy += yl.sf; yl.sf /= 16.f;
754
- #elif THREADS_PER_BLK == 4 // Each thread processes 1/4 block
755
- yl.s0123 = *(global float4 *)(yb);
756
- yl.s4567 = *(global float4 *)(yb + 16);
757
-
758
- sumy += yl.s0;
759
- sumy += yl.s1;
760
- sumy += yl.s2;
761
- sumy += yl.s3;
762
- sumy += yl.s4; yl.s4 /= 16.f;
763
- sumy += yl.s5; yl.s5 /= 16.f;
764
- sumy += yl.s6; yl.s6 /= 16.f;
765
- sumy += yl.s7; yl.s7 /= 16.f;
766
- #elif THREADS_PER_BLK == 8 // Each thread processes 1/8 block
767
- yl.s01 = *(global float2 *)(yb);
768
- yl.s23 = *(global float2 *)(yb + 16);
769
-
770
- sumy += yl.s0;
771
- sumy += yl.s1;
772
- sumy += yl.s2; yl.s2 /= 16.f;
773
- sumy += yl.s3; yl.s3 /= 16.f;
774
- #endif
775
-
776
- sumf.s0 += block_q_4_0_dot_y_flat(q_blk_0, *(d + ib + 0*nb), sumy, yl);
777
- sumf.s1 += block_q_4_0_dot_y_flat(q_blk_1, *(d + ib + 1*nb), sumy, yl);
778
- #if N_DST == 4 || N_DST == 8 || N_DST == 16
779
- sumf.s2 += block_q_4_0_dot_y_flat(q_blk_2, *(d + ib + 2*nb), sumy, yl);
780
- sumf.s3 += block_q_4_0_dot_y_flat(q_blk_3, *(d + ib + 3*nb), sumy, yl);
781
- #endif
782
- #if N_DST == 8 || N_DST == 16
783
- sumf.s4 += block_q_4_0_dot_y_flat(q_blk_4, *(d + ib + 4*nb), sumy, yl);
784
- sumf.s5 += block_q_4_0_dot_y_flat(q_blk_5, *(d + ib + 5*nb), sumy, yl);
785
- sumf.s6 += block_q_4_0_dot_y_flat(q_blk_6, *(d + ib + 6*nb), sumy, yl);
786
- sumf.s7 += block_q_4_0_dot_y_flat(q_blk_7, *(d + ib + 7*nb), sumy, yl);
787
- #endif
788
-
789
- yb += QK4_0 * (N_SIMDWIDTH/THREADS_PER_BLK);
790
- }
791
-
792
- SUM_TY tot = (SUM_TY)(
793
- sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1)
794
- #if N_DST == 4 || N_DST == 8 || N_DST == 16
795
- , sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
796
- #endif
797
- #if N_DST == 8 || N_DST == 16
798
- , sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5)
799
- , sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
800
- #endif
801
- );
802
-
803
- if (get_sub_group_local_id() == 0) {
804
- if (first_row + 0 < ne01) {
805
- dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
806
- }
807
- if (first_row + 1 < ne01) {
808
- dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
809
- }
810
- #if N_DST == 4 || N_DST == 8 || N_DST == 16
811
- if (first_row + 2 < ne01) {
812
- dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
813
- }
814
- if (first_row + 3 < ne01) {
815
- dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
816
- }
817
- #endif
818
- #if N_DST == 8 || N_DST == 16
819
- if (first_row + 4 < ne01) {
820
- dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
821
- }
822
- if (first_row + 5 < ne01) {
823
- dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
824
- }
825
- if (first_row + 6 < ne01) {
826
- dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
827
- }
828
- if (first_row + 7 < ne01) {
829
- dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
830
- }
831
- #endif
832
- }
833
- }
834
-
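As a quick sanity check on the THREADS_PER_BLK bookkeeping used by the kernel above (a hypothetical standalone helper, not code from this file): with QK4_0 = 32 weights per block, each thread covers QK4_0/2/THREADS_PER_BLK quant bytes and QK4_0/THREADS_PER_BLK activation values, split evenly between the low-nibble half (y[0..15]) and the high-nibble half (y[16..31]); this is what the Q_BLK_LD_TY/ACT_TY choices encode.

```c
// Hypothetical helper illustrating the quantities derived from THREADS_PER_BLK.
#include <stdio.h>

#define QK4_0 32

int main(void) {
    for (int threads_per_blk = 2; threads_per_blk <= 8; threads_per_blk *= 2) {
        int bytes_per_thread = QK4_0 / 2 / threads_per_blk; // quant bytes (2 weights per byte)
        int acts_per_thread  = QK4_0 / threads_per_blk;     // activations, half low-nibble, half high-nibble
        printf("THREADS_PER_BLK=%d: %d quant bytes, %d activations per thread\n",
               threads_per_blk, bytes_per_thread, acts_per_thread);
    }
    return 0;
}
```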
835
- //------------------------------------------------------------------------------
836
- // Using image1d_buffer_t
837
-
838
- #if defined(cl_qcom_subgroup_shuffle)
839
- #pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
840
- float qcom_sub_group_reduce_add(float sum) {
841
- sum += qcom_sub_group_shuffle_down(sum, 32, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
842
- sum += qcom_sub_group_shuffle_down(sum, 16, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
843
- sum += qcom_sub_group_shuffle_down(sum, 8, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
844
- sum += qcom_sub_group_shuffle_down(sum, 4, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
845
- sum += qcom_sub_group_shuffle_down(sum, 2, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
846
- sum += qcom_sub_group_shuffle_down(sum, 1, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
847
- return sum;
848
- }
849
- #define sub_group_reduce_add qcom_sub_group_reduce_add
850
- #else
851
- #define sub_group_reduce_add sub_group_reduce_add
852
- #endif
853
-
854
- #undef THREADS_PER_BLK
855
- #undef N_DST
856
- #undef N_SIMDGROUP
857
- #undef N_SIMDWIDTH
858
-
859
- #ifdef INTEL_GPU
860
- #define THREADS_PER_BLK 4 // Number of threads per block; each thread processes 1/THREADS_PER_BLK of a block
861
- #define N_DST 4
862
- #define N_SIMDGROUP 1
863
- #define N_SIMDWIDTH 16
864
- #elif defined (ADRENO_GPU)
865
- #define THREADS_PER_BLK 4
866
- #define N_DST 4
867
- #define N_SIMDGROUP 1
868
- #define N_SIMDWIDTH 64
869
- #endif
870
-
871
- #if THREADS_PER_BLK == 2 // Each thread processes 1/2 block
872
- # define ACT_TY float16
873
- # define Q_BLK_LD_TY float2
874
- # define EXTRACT_BLK_DATA(tmp, part) *((float2*)&tmp + part)
875
- # define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v8
876
- #elif THREADS_PER_BLK == 4 // Each thread processes 1/4 block
877
- # define ACT_TY float8
878
- # define Q_BLK_LD_TY float
879
- # define EXTRACT_BLK_DATA(tmp, part) *((float*)&tmp + part)
880
- # define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v4
881
- #elif THREADS_PER_BLK == 8 // Each thread processes 1/8 block
882
- # define ACT_TY float4
883
- # define Q_BLK_LD_TY half
884
- # define EXTRACT_BLK_DATA(tmp, part) *((half*)&tmp + part)
885
- # define block_q_4_0_dot_y_flat block_q_4_0_dot_y_flat_v2
886
- #endif
887
-
888
- #define BTYES_PER_THREAD_IN_BLK (QK4_0/2/THREADS_PER_BLK)
889
-
890
- #if N_DST == 2
891
- # define SUM_TY float2
892
- #elif N_DST == 4
893
- # define SUM_TY float4
894
- #elif N_DST == 8
895
- # define SUM_TY float8
896
- #elif N_DST == 16
897
- # define SUM_TY float16
898
- #endif
899
-
900
- #ifdef INTEL_GPU
901
- REQD_SUBGROUP_SIZE_16
902
- #elif defined (ADRENO_GPU)
903
- REQD_SUBGROUP_SIZE_64
904
- #endif
905
- kernel void kernel_mul_mat_q4_0_f32_flat_img_v0(
906
- read_only image1d_buffer_t src0_q,
907
- read_only image1d_buffer_t src0_d,
908
- global float * src1,
909
- ulong offset1,
910
- global float * dst,
911
- ulong offsetd,
912
- int ne00,
913
- int ne01,
914
- int ne02,
915
- int ne10,
916
- int ne12,
917
- int ne0,
918
- int ne1,
919
- int r2,
920
- int r3
921
- ) {
922
- src1 = (global float*)((global char*)src1 + offset1);
923
- dst = (global float*)((global char*)dst + offsetd);
924
-
925
- const int nb = ne00/QK4_0;
926
-
927
- int r0 = get_group_id(0);
928
- int r1 = get_group_id(1);
929
- int im = get_group_id(2);
930
-
931
- int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
932
-
933
- int i12 = im%ne12;
934
- int i13 = im/ne12;
935
-
936
- // The number of scales is the same as the number of blocks.
937
- ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
938
- // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
939
- ulong offset0_q = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
940
-
941
- global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
942
-
943
- int ix = get_sub_group_local_id()/THREADS_PER_BLK;
944
- int il = get_sub_group_local_id()%THREADS_PER_BLK;
945
-
946
- global float * yb = y + ix*QK4_0 + BTYES_PER_THREAD_IN_BLK*il;
947
-
948
- // Registers for caching activation
949
- ACT_TY yl = 0.f;
950
-
951
- // Registers for caching quants
952
- Q_BLK_LD_TY q_blk_0 = 0, q_blk_1 = 0;
953
- #if N_DST == 4 || N_DST == 8 || N_DST == 16
954
- Q_BLK_LD_TY q_blk_2 = 0, q_blk_3 = 0;
955
- #endif
956
- #if N_DST == 8 || N_DST == 16
957
- Q_BLK_LD_TY q_blk_4 = 0, q_blk_5 = 0, q_blk_6 = 0, q_blk_7 = 0;
958
- #endif
959
-
960
- // Partial sum
961
- SUM_TY sumf = 0.f;
962
-
963
- for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/THREADS_PER_BLK) {
964
- float sumy = 0.f;
965
-
966
- float4 tmp;
967
- tmp = read_imagef(src0_q, offset0_q + ib + 0*nb);
968
- q_blk_0 = EXTRACT_BLK_DATA(tmp, il);
969
- tmp = read_imagef(src0_q, offset0_q + ib + 1*nb);
970
- q_blk_1 = EXTRACT_BLK_DATA(tmp, il);
971
- #if N_DST == 4 || N_DST == 8 || N_DST == 16
972
- tmp = read_imagef(src0_q, offset0_q + ib + 2*nb);
973
- q_blk_2 = EXTRACT_BLK_DATA(tmp, il);
974
- tmp = read_imagef(src0_q, offset0_q + ib + 3*nb);
975
- q_blk_3 = EXTRACT_BLK_DATA(tmp, il);
976
- #endif
977
- #if N_DST == 8 || N_DST == 16
978
- tmp = read_imagef(src0_q, offset0_q + ib + 4*nb);
979
- q_blk_4 = EXTRACT_BLK_DATA(tmp, il);
980
- tmp = read_imagef(src0_q, offset0_q + ib + 5*nb);
981
- q_blk_5 = EXTRACT_BLK_DATA(tmp, il);
982
- tmp = read_imagef(src0_q, offset0_q + ib + 6*nb);
983
- q_blk_6 = EXTRACT_BLK_DATA(tmp, il);
984
- tmp = read_imagef(src0_q, offset0_q + ib + 7*nb);
985
- q_blk_7 = EXTRACT_BLK_DATA(tmp, il);
986
- #endif
987
-
988
- // Load activation
989
- #if THREADS_PER_BLK == 2 // Each thread processes 1/2 block
990
- yl.s01234567 = *(global float8 *)(yb);
991
- yl.s89abcdef = *(global float8 *)(yb + 16);
992
-
993
- sumy += yl.s0;
994
- sumy += yl.s1;
995
- sumy += yl.s2;
996
- sumy += yl.s3;
997
- sumy += yl.s4;
998
- sumy += yl.s5;
999
- sumy += yl.s6;
1000
- sumy += yl.s7;
1001
- sumy += yl.s8; yl.s8 /= 16.f;
1002
- sumy += yl.s9; yl.s9 /= 16.f;
1003
- sumy += yl.sa; yl.sa /= 16.f;
1004
- sumy += yl.sb; yl.sb /= 16.f;
1005
- sumy += yl.sc; yl.sc /= 16.f;
1006
- sumy += yl.sd; yl.sd /= 16.f;
1007
- sumy += yl.se; yl.se /= 16.f;
1008
- sumy += yl.sf; yl.sf /= 16.f;
1009
- #elif THREADS_PER_BLK == 4 // Each thread processes 1/4 block
1010
- yl.s0123 = *(global float4 *)(yb);
1011
- yl.s4567 = *(global float4 *)(yb + 16);
1012
-
1013
- sumy += yl.s0;
1014
- sumy += yl.s1;
1015
- sumy += yl.s2;
1016
- sumy += yl.s3;
1017
- sumy += yl.s4; yl.s4 /= 16.f;
1018
- sumy += yl.s5; yl.s5 /= 16.f;
1019
- sumy += yl.s6; yl.s6 /= 16.f;
1020
- sumy += yl.s7; yl.s7 /= 16.f;
1021
- #elif THREADS_PER_BLK == 8 // Each thread processes 1/8 block
1022
- yl.s01 = *(global float2 *)(yb);
1023
- yl.s23 = *(global float2 *)(yb + 16);
1024
-
1025
- sumy += yl.s0;
1026
- sumy += yl.s1;
1027
- sumy += yl.s2; yl.s2 /= 16.f;
1028
- sumy += yl.s3; yl.s3 /= 16.f;
1029
- #endif
1030
-
1031
- sumf.s0 += block_q_4_0_dot_y_flat(q_blk_0, read_imageh(src0_d, offset0_d + ib + 0*nb).s0, sumy, yl);
1032
- sumf.s1 += block_q_4_0_dot_y_flat(q_blk_1, read_imageh(src0_d, offset0_d + ib + 1*nb).s0, sumy, yl);
1033
- #if N_DST == 4 || N_DST == 8 || N_DST == 16
1034
- sumf.s2 += block_q_4_0_dot_y_flat(q_blk_2, read_imageh(src0_d, offset0_d + ib + 2*nb).s0, sumy, yl);
1035
- sumf.s3 += block_q_4_0_dot_y_flat(q_blk_3, read_imageh(src0_d, offset0_d + ib + 3*nb).s0, sumy, yl);
1036
- #endif
1037
- #if N_DST == 8 || N_DST == 16
1038
- sumf.s4 += block_q_4_0_dot_y_flat(q_blk_4, read_imageh(src0_d, offset0_d + ib + 4*nb).s0, sumy, yl);
1039
- sumf.s5 += block_q_4_0_dot_y_flat(q_blk_5, read_imageh(src0_d, offset0_d + ib + 5*nb).s0, sumy, yl);
1040
- sumf.s6 += block_q_4_0_dot_y_flat(q_blk_6, read_imageh(src0_d, offset0_d + ib + 6*nb).s0, sumy, yl);
1041
- sumf.s7 += block_q_4_0_dot_y_flat(q_blk_7, read_imageh(src0_d, offset0_d + ib + 7*nb).s0, sumy, yl);
1042
- #endif
1043
-
1044
- yb += QK4_0 * (N_SIMDWIDTH/THREADS_PER_BLK);
1045
- }
1046
-
1047
- SUM_TY tot = (SUM_TY)(
1048
- sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1)
1049
- #if N_DST == 4 || N_DST == 8 || N_DST == 16
1050
- , sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
1051
- #endif
1052
- #if N_DST == 8 || N_DST == 16
1053
- , sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5)
1054
- , sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
1055
- #endif
1056
- );
1057
-
1058
- if (get_sub_group_local_id() == 0) {
1059
- if (first_row + 0 < ne01) {
1060
- dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
1061
- }
1062
- if (first_row + 1 < ne01) {
1063
- dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
1064
- }
1065
- #if N_DST == 4 || N_DST == 8 || N_DST == 16
1066
- if (first_row + 2 < ne01) {
1067
- dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
1068
- }
1069
- if (first_row + 3 < ne01) {
1070
- dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
1071
- }
1072
- #endif
1073
- #if N_DST == 8 || N_DST == 16
1074
- if (first_row + 4 < ne01) {
1075
- dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
1076
- }
1077
- if (first_row + 5 < ne01) {
1078
- dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
1079
- }
1080
- if (first_row + 6 < ne01) {
1081
- dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
1082
- }
1083
- if (first_row + 7 < ne01) {
1084
- dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
1085
- }
1086
- #endif
1087
- }
1088
- }
1089
-
1090
- //------------------------------------------------------------------------------
1091
- // kernel_mul_mv_q6_K_f32
1092
- //------------------------------------------------------------------------------
1093
-
1094
- #undef N_DST
1095
- #undef N_SIMDGROUP
1096
- #undef N_SIMDWIDTH
1097
-
1098
- #ifdef INTEL_GPU
1099
- #define N_DST 1 // number of rows each SIMD group works on
1100
- #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
1101
- #define N_SIMDWIDTH 16 // SIMD group size
1102
- #elif defined (ADRENO_GPU)
1103
- #define N_DST 1
1104
- #define N_SIMDGROUP 2
1105
- #define N_SIMDWIDTH 64
1106
- #endif
1107
-
1108
- #define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes
1109
-
1110
- #ifdef INTEL_GPU
1111
- REQD_SUBGROUP_SIZE_16
1112
- #elif defined (ADRENO_GPU)
1113
- REQD_SUBGROUP_SIZE_64
1114
- #endif
1115
- kernel void kernel_mul_mv_q6_K_f32(
1116
- global void * src0,
1117
- ulong offset0,
1118
- global float * src1,
1119
- ulong offset1,
1120
- global float * dst,
1121
- ulong offsetd,
1122
- int ne00,
1123
- int ne01,
1124
- int ne02,
1125
- int ne10,
1126
- int ne12,
1127
- int ne0,
1128
- int ne1,
1129
- int r2,
1130
- int r3
1131
- ) {
1132
- src0 = (global void*)((global char*)src0 + offset0);
1133
- src1 = (global float*)((global char*)src1 + offset1);
1134
- dst = (global float*)((global char*)dst + offsetd);
1135
-
1136
- uchar kmask1 = 0x03;
1137
- uchar kmask2 = 0x0C;
1138
- uchar kmask3 = 0x30;
1139
- uchar kmask4 = 0xC0;
1140
-
1141
- int nb = ne00/QK_K;
1142
-
1143
- int r0 = get_group_id(0);
1144
- int r1 = get_group_id(1);
1145
- int im = get_group_id(2);
1146
-
1147
- int row = N_SIMDGROUP * r0 + get_sub_group_id();
1148
-
1149
- int i12 = im%ne12;
1150
- int i13 = im/ne12;
1151
-
1152
- ulong offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
1153
-
1154
- global block_q6_K * x = (global block_q6_K *) src0 + row*nb + offset_src0;
1155
- global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1;
1156
-
1157
- float sumf = 0;
1158
-
1159
- // For Q6_K quantization, 16 values form a subblock, 16 subblocks form a
1160
- // block. Values in a subblock share a scale that is quantized with 8 bits;
1161
- // the entire block shares a single floating point scale.
1162
- // For work distribution, each thread processes a subblock (16 weights), hence
1163
- // 16 threads process a (super) block -- a subgroup thus handles SIMDWIDTH/16
1164
- // (super) blocks -- this is the block stride.
1165
- // The 16 threads that process a (super) block are split into 2 portions, each with
1166
- // 8 threads; each portion works on 8 subblocks.
1167
- // For a subgroup of 16 threads, the entire subgroup works on a single (super) block
1168
- // before moving to the next (super) block. Thread0 - thread7 work on the
1169
- // first 8 subblocks; thread8 - thread15 work on the last 8 subblocks.
1170
- // Thread0 - thread3 work on subblocks 0, 2, 4, 6; thread4 - thread7 work on
1171
- // subblocks 1, 3, 5, 7. Each thread does not work on an entire subblock, but
1172
- // works on a total of 16 weight values.
1173
- int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
1174
- int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
1175
- int ip = tid/8; // first or second half of (super) block (0 or 1)
1176
- int il = tid%8; // each half has 8 parts, one per scale
1177
- int n = 4; // 4 scales at a time (and 4 sums)
1178
- int l0 = n*il; // offset into half-block, 0..28
1179
- int is = 8*ip + l0/16; // 0, 1, 8, 9
1180
-
1181
- int y_offset = 128*ip + l0;
1182
- int q_offset_l = 64*ip + l0;
1183
- int q_offset_h = 32*ip + l0;
1184
-
1185
- for (int i = ix; i < nb; i += BLOCK_STRIDE) {
1186
-
1187
- global uint8_t * q1 = x[i].ql + q_offset_l;
1188
- global uint8_t * q2 = q1 + QK_K/8;
1189
- global uint8_t * qh = x[i].qh + q_offset_h;
1190
- global int8_t * sc = x[i].scales + is;
1191
-
1192
- global float * y = yy + i * QK_K + y_offset;
1193
-
1194
- float dall = x[i].d;
1195
-
1196
- float4 sums = {0.f, 0.f, 0.f, 0.f};
1197
-
1198
- sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & kmask1) << 4)) - 32.f);
1199
- sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & kmask2) << 2)) - 32.f);
1200
- sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & kmask3) << 0)) - 32.f);
1201
- sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & kmask4) >> 2)) - 32.f);
1202
-
1203
- sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & kmask1) << 4)) - 32.f);
1204
- sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & kmask2) << 2)) - 32.f);
1205
- sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & kmask3) << 0)) - 32.f);
1206
- sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & kmask4) >> 2)) - 32.f);
1207
-
1208
- sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & kmask1) << 4)) - 32.f);
1209
- sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & kmask2) << 2)) - 32.f);
1210
- sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & kmask3) << 0)) - 32.f);
1211
- sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & kmask4) >> 2)) - 32.f);
1212
-
1213
- sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & kmask1) << 4)) - 32.f);
1214
- sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & kmask2) << 2)) - 32.f);
1215
- sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & kmask3) << 0)) - 32.f);
1216
- sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & kmask4) >> 2)) - 32.f);
1217
-
1218
- sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
1219
- }
1220
-
1221
- float tot = sub_group_reduce_add(sumf);
1222
- if (get_sub_group_local_id() == 0) {
1223
- dst[r1*ne0 + im*ne0*ne1 + row] = tot;
1224
- }
1225
- }
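For reference, here is a scalar C sketch (hypothetical and standalone, not part of the kernel) of the per-weight reconstruction performed inside the loop above: each qh byte supplies the top two bits of four 6-bit codes whose weights sit 32 columns apart, the low four bits come from the corresponding ql byte (low or high nibble), and the dequantized value is d * scale * (code - 32).

```c
#include <stdint.h>
#include <stdio.h>

// Hypothetical scalar sketch of the Q6_K reconstruction in the kernel above.
// ql_lo/ql_hi correspond to q1[l]/q2[l]; qh supplies the top 2 bits of each
// 6-bit code; sc holds the four relevant 8-bit subblock scales; d is the
// per-block fp16 scale (taken here as float).
static void q6_K_reconstruct4(uint8_t ql_lo, uint8_t ql_hi, uint8_t qh,
                              float d, const int8_t sc[4], float w[4]) {
    w[0] = d * sc[0] * (float)((int)((ql_lo & 0x0F) | ((qh & 0x03) << 4)) - 32); // column +0
    w[1] = d * sc[1] * (float)((int)((ql_hi & 0x0F) | ((qh & 0x0C) << 2)) - 32); // column +32
    w[2] = d * sc[2] * (float)((int)((ql_lo >>   4) | ((qh & 0x30) << 0)) - 32); // column +64
    w[3] = d * sc[3] * (float)((int)((ql_hi >>   4) | ((qh & 0xC0) >> 2)) - 32); // column +96
}

int main(void) {
    const int8_t sc[4] = {3, -2, 5, 1};
    float w[4];
    q6_K_reconstruct4(0xA7, 0x4C, 0xB9, 0.02f, sc, w);
    printf("%f %f %f %f\n", w[0], w[1], w[2], w[3]);
    return 0;
}
```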
 
ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl DELETED
@@ -1,139 +0,0 @@
1
- // src0_q, src0_d, src1 are transposed as a preprocessing step
2
- // 4-bit weights are transposed in groups of 4 (unsigned short int)
3
- // consider weights originally "next to each other", now "on top of each other"
4
- // each fiber computes an 8x4 tile of output elements
5
- // using unshuffled weights
6
-
7
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
8
- #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
9
-
10
- #ifdef cl_qcom_reqd_sub_group_size
11
- #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
12
- #define ADRENO_GPU 1
13
- #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
14
- #endif
15
-
16
- #ifdef ADRENO_GPU
17
- REQD_SUBGROUP_SIZE_128
18
- #endif
19
-
20
- kernel void kernel_mul_mat_Ab_Bi_8x4(
21
- global const ushort * src0_q, // quantized A
22
- global const half * src0_d, // A scales
23
- __read_only image1d_buffer_t src1, // B (1d image)
24
- global float * dst, // C
25
- int m, // M
26
- int n, // N with padding
27
- int k, // K
28
- int n_no_padding // N without padding
29
- ) {
30
-
31
- int m_4 = m >> 2;
32
- int n_4 = n >> 2;
33
-
34
- int gy = get_global_id(0);
35
- int gx = get_global_id(1);
36
- int gx_2 = gx << 2;
37
-
38
- half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // 8x4 output elements
39
- half8 B; // registers for activations
40
- half4 dequantized_weights; // registers for dequantized weights
41
- __global const ushort* weight_ptr = src0_q + gx_2; // pointer for weights
42
- __global const half* scale_ptr = src0_d + gx_2; // pointer for scales
43
-
44
- for(int i=0; i<k; i+=4){ //loop through K dimension
45
-
46
- B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4));
47
- B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1);
48
-
49
- // keep (i/4) and (i/32) in parenthesis, rounds down
50
- // load 4 consecutive groups of 4 weights
51
- ushort4 bits4 = vload4(0, weight_ptr + (i/4)*(m)); // (i/4) because weights grouped in 4s
52
-
53
- // load 4 consecutive scales
54
- half4 scale = vload4(0, scale_ptr + (i/32)*(m));// (i/32) because 1 scale per 32 elements
55
-
56
- // j=0
57
- dequantized_weights.s0 = ((bits4.s0 & (0x000F)) - 8) * scale.s0; // dequantize a row of the 16 weights
58
- dequantized_weights.s1 = ((bits4.s1 & (0x000F)) - 8) * scale.s1;
59
- dequantized_weights.s2 = ((bits4.s2 & (0x000F)) - 8) * scale.s2;
60
- dequantized_weights.s3 = ((bits4.s3 & (0x000F)) - 8) * scale.s3;
61
- c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
62
- c1 += B * dequantized_weights.s1;
63
- c2 += B * dequantized_weights.s2;
64
- c3 += B * dequantized_weights.s3;
65
-
66
- // j=1
67
- B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4));
68
- B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1);
69
- dequantized_weights.s0 = (((bits4.s0 & (0x00F0)) >> 4) - 8) * scale.s0; // dequantize a row of the 16 weights
70
- dequantized_weights.s1 = (((bits4.s1 & (0x00F0)) >> 4) - 8) * scale.s1;
71
- dequantized_weights.s2 = (((bits4.s2 & (0x00F0)) >> 4) - 8) * scale.s2;
72
- dequantized_weights.s3 = (((bits4.s3 & (0x00F0)) >> 4) - 8) * scale.s3;
73
- c0 += B * dequantized_weights.s0; //vector-scalar multiplication to accumulate
74
- c1 += B * dequantized_weights.s1;
75
- c2 += B * dequantized_weights.s2;
76
- c3 += B * dequantized_weights.s3;
77
-
78
- // j=2
79
- B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4));
80
- B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1);
81
- dequantized_weights.s0 = (((bits4.s0 & (0x0F00)) >> 8) - 8) * scale.s0; // dequantize a row of the 16 weights
82
- dequantized_weights.s1 = (((bits4.s1 & (0x0F00)) >> 8) - 8) * scale.s1;
83
- dequantized_weights.s2 = (((bits4.s2 & (0x0F00)) >> 8) - 8) * scale.s2;
84
- dequantized_weights.s3 = (((bits4.s3 & (0x0F00)) >> 8) - 8) * scale.s3;
85
- c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
86
- c1 += B * dequantized_weights.s1;
87
- c2 += B * dequantized_weights.s2;
88
- c3 += B * dequantized_weights.s3;
89
-
90
- // j=3
91
- B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4));
92
- B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1);
93
- dequantized_weights.s0 = (((bits4.s0 & (0xF000)) >> 12) - 8) * scale.s0; // dequantize a row of the 16 weights
94
- dequantized_weights.s1 = (((bits4.s1 & (0xF000)) >> 12) - 8) * scale.s1;
95
- dequantized_weights.s2 = (((bits4.s2 & (0xF000)) >> 12) - 8) * scale.s2;
96
- dequantized_weights.s3 = (((bits4.s3 & (0xF000)) >> 12) - 8) * scale.s3;
97
- c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
98
- c1 += B * dequantized_weights.s1;
99
- c2 += B * dequantized_weights.s2;
100
- c3 += B * dequantized_weights.s3;
101
- }
102
-
103
- int idx = (gy<<3)*m + (gx<<2); // vectorized store 16 elements
104
-
105
- // conditional check if store is to a valid location. Required when N is not a multiple of 8
106
- // if statements allow registers to be reused for each store
107
- // provides a performance boost due to reduced register footprint, which increases number of concurrent waves
108
- if(idx+3 < m*n_no_padding){
109
- vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
110
- idx += m;
111
- }
112
- if(idx+3 < m*n_no_padding){
113
- vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
114
- idx += m;
115
- }
116
- if(idx+3 < m*n_no_padding){
117
- vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
118
- idx += m;
119
- }
120
- if(idx+3 < m*n_no_padding){
121
- vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
122
- idx += m;
123
- }
124
- if(idx+3 < m*n_no_padding){
125
- vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
126
- idx += m;
127
- }
128
- if(idx+3 < m*n_no_padding){
129
- vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
130
- idx += m;
131
- }
132
- if(idx+3 < m*n_no_padding){
133
- vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
134
- idx += m;
135
- }
136
- if(idx+3 < m*n_no_padding){
137
- vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
138
- }
139
- }
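As the unrolled j = 0..3 steps above suggest, each 16-bit word of src0_q packs four 4-bit codes, one per K step, and the true weight is (code - 8) * scale. A minimal scalar C sketch of that unpacking (hypothetical helper, not part of the kernel):

```c
#include <stdint.h>
#include <stdio.h>

// Hypothetical scalar sketch: unpack one ushort of 4-bit codes and apply the
// shared scale, mirroring the masks 0x000F/0x00F0/0x0F00/0xF000 used above.
static void dequant_ushort_q4(uint16_t bits, float scale, float w[4]) {
    for (int j = 0; j < 4; ++j) {
        w[j] = (float)(((bits >> (4 * j)) & 0xF) - 8) * scale;
    }
}

int main(void) {
    float w[4];
    dequant_ushort_q4(0xC5A3, 0.05f, w);
    printf("%f %f %f %f\n", w[0], w[1], w[2], w[3]);
    return 0;
}
```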
 
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl DELETED
@@ -1,26 +0,0 @@
1
- // 16-bit transpose, loading/storing a 4x4 tile of elements
2
-
3
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
4
-
5
- kernel void kernel_transpose_16(
6
- __read_only image1d_buffer_t input,
7
- __write_only image1d_buffer_t output,
8
- const uint rows,
9
- const uint cols
10
- ) {
11
-
12
- const int i = get_global_id(0);
13
- const int j = get_global_id(1);
14
- const int i_2 = i<<2;
15
- const int j_2 = j<<2;
16
-
17
- half4 temp0 = read_imageh(input, (j_2+0)*cols+i);
18
- half4 temp1 = read_imageh(input, (j_2+1)*cols+i);
19
- half4 temp2 = read_imageh(input, (j_2+2)*cols+i);
20
- half4 temp3 = read_imageh(input, (j_2+3)*cols+i);
21
-
22
- write_imageh(output, (i_2+0)*rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
23
- write_imageh(output, (i_2+1)*rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
24
- write_imageh(output, (i_2+2)*rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
25
- write_imageh(output, (i_2+3)*rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
26
- }
 
 
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl DELETED
@@ -1,25 +0,0 @@
1
- // 32-bit transpose, loading/storing a 4x4 tile of elements
2
-
3
- kernel void kernel_transpose_32(
4
- __read_only image1d_buffer_t input,
5
- __write_only image1d_buffer_t output,
6
- const uint rows,
7
- const uint cols
8
- ) {
9
-
10
- const int i = get_global_id(0);
11
- const int j = get_global_id(1);
12
- const int i_2 = i<<2;
13
- const int j_2 = j<<2;
14
-
15
- float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
16
- float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
17
- float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
18
- float4 temp3 = read_imagef(input, (j_2+3)*cols+i);
19
-
20
- write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
21
- write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
22
- write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
23
- write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
24
-
25
- }
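A plausible scalar reading of the two transpose kernels above (a hypothetical reference under the assumption that rows and cols count 4-element texels, as the indexing suggests): the input is treated as a (4*rows) x (4*cols) element matrix and the output as its (4*cols) x (4*rows) transpose, with each work item moving one 4x4 tile.

```c
#include <stdio.h>

// Hypothetical scalar reference for kernel_transpose_32, assuming rows/cols
// are texel counts (4 elements each): out is the element-wise transpose of in.
static void transpose_32_ref(const float *in, float *out,
                             unsigned rows, unsigned cols) {
    const unsigned R = 4u * rows;   // element rows of the input
    const unsigned C = 4u * cols;   // element columns of the input
    for (unsigned r = 0; r < R; ++r) {
        for (unsigned c = 0; c < C; ++c) {
            out[c * R + r] = in[r * C + c];
        }
    }
}

int main(void) {
    float in[4 * 4], out[4 * 4];          // rows = cols = 1 texel
    for (int i = 0; i < 16; ++i) in[i] = (float)i;
    transpose_32_ref(in, out, 1, 1);
    printf("out[1] = %f (expected 4)\n", out[1]);
    return 0;
}
```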
 
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl DELETED
@@ -1,35 +0,0 @@
1
- // 32-bit transpose, loading/storing a 4x4 tile of elements
2
- // Only used for activations
3
- // converts to FP16
4
- // also adds zero padding for prompt lengths that are not a multiple of 8
5
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
-
7
- kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {
8
-
9
- const int i = get_global_id(0);
10
- const int j = get_global_id(1);
11
- const int i_2 = i<<2;
12
- const int j_2 = j<<2;
13
- half4 temp0 = {0,0,0,0}; // initialize outputs to 0
14
- half4 temp1 = {0,0,0,0};
15
- half4 temp2 = {0,0,0,0};
16
- half4 temp3 = {0,0,0,0};
17
-
18
- if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0
19
- temp0 = read_imageh(input, (j_2+0)*cols+i);
20
- }
21
- if((j_2+1)*cols+i*4+3 < rows*cols*16){
22
- temp1 = read_imageh(input, (j_2+1)*cols+i);
23
- }
24
- if((j_2+2)*cols+i*4+3 < rows*cols*16){
25
- temp2 = read_imageh(input, (j_2+2)*cols+i);
26
- }
27
- if((j_2+3)*cols+i*4+3 < rows*cols*16){
28
- temp3 = read_imageh(input, (j_2+3)*cols+i);
29
- }
30
-
31
- write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
32
- write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
33
- write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
34
- write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
35
- }