lhez committed
Commit d0a458b · 1 Parent(s): dcb106f

opencl: add `mul_mv_id_q4_0_f32_8x_flat` (llama/14003)

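For orientation, `GGML_OP_MUL_MAT_ID` is the indirect, expert-selecting matrix-vector product this commit adds an OpenCL kernel for: an ids tensor (`dst->src[2]` in the host code below) picks which slice of the quantized weight tensor `src0` each output row is computed from. The following scalar loop is only an editor's illustration of that semantics, not code from the commit; the names `experts`, `ids`, `n_expert_used` and the use of dense (already dequantized) weights are placeholders.

```cpp
#include <cstdint>
#include <vector>

// Illustrative reference loop: for every token and every used-expert slot,
// select the expert matrix by id and do a plain mat-vec into the dst row block.
void mul_mat_id_reference(
        const std::vector<std::vector<float>> & experts, // [n_expert][ne01*ne00], row-major dense weights
        const std::vector<float>   & src1,               // [n_token][ne00] activations, flattened
        const std::vector<int32_t> & ids,                // [n_token][n_expert_used] expert indices, flattened
        std::vector<float>         & dst,                // [n_token][n_expert_used][ne01], flattened
        int ne00, int ne01, int n_token, int n_expert_used) {
    for (int t = 0; t < n_token; ++t) {
        for (int e = 0; e < n_expert_used; ++e) {
            const int expert = ids[t*n_expert_used + e];   // plays the role of i02 in the kernel
            const float * W = experts[expert].data();
            const float * x = &src1[t*ne00];
            float       * y = &dst[(t*n_expert_used + e)*ne01];
            for (int row = 0; row < ne01; ++row) {
                float acc = 0.f;
                for (int k = 0; k < ne00; ++k) {
                    acc += W[row*ne00 + k] * x[k];
                }
                y[row] = acc;
            }
        }
    }
}
```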
ggml/src/ggml-opencl/CMakeLists.txt CHANGED
@@ -80,6 +80,7 @@ set(GGML_OPENCL_KERNELS
     mul_mv_q4_0_f32_1d_8x_flat
     mul_mv_q4_0_f32_1d_16x_flat
     mul_mv_q6_k
+    mul_mv_id_q4_0_f32_8x_flat
     mul
     norm
     relu
ggml/src/ggml-opencl/ggml-opencl.cpp CHANGED
@@ -321,6 +321,7 @@ struct ggml_backend_opencl_context {
     cl_program program_upscale;
     cl_program program_concat;
     cl_program program_tsembd;
+    cl_program program_mul_mv_id_q4_0_f32_8x_flat;
 
     cl_kernel kernel_add, kernel_add_row;
     cl_kernel kernel_mul, kernel_mul_row;
 
@@ -366,6 +367,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_concat_f32_contiguous;
     cl_kernel kernel_concat_f32_non_contiguous;
     cl_kernel kernel_timestep_embedding;
+    cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Transpose kernels
 
@@ -1112,7 +1114,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
-    // repeat
+    // repeat
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
 
@@ -1256,6 +1258,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         }
     }
 
+    // mul_mv_id_q4_0_f32_8x_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
+#endif
+        backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
 
@@ -2178,6 +2196,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
             }
             return false;
+        case GGML_OP_MUL_MAT_ID:
+            if (op->src[0]->type == GGML_TYPE_Q4_0) {
+                if (op->src[1]->type == GGML_TYPE_F32) {
+                    return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
+                }
+            }
+            return false;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
 
@@ -5536,6 +5561,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     }
 }
 
+static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const ggml_tensor * src2 = dst->src[2];
+    GGML_ASSERT(src2);
+    GGML_ASSERT(src2->extra);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offset2 = extra2->offset + src2->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+#ifdef GGML_OPENCL_SOA_Q
+    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+#endif
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb02 = src0->nb[2];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+
+    const int ne20 = src2->ne[0];
+    const int ne21 = src2->ne[1];
+
+    const cl_ulong nb21 = src2->nb[1];
+
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+
+    const int r2 = ne12/ne02;
+    const int r3 = ne13/ne03;
+    const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
+
+    GGML_ASSERT(ne00 == ne10);
+
+    int sgs = 32; // subgroup size
+    int nsg = 1; // number of subgroups
+    int nrows = 1; // number of rows in src1
+    int ndst = 4; // number of values produced by each subgroup
+
+    cl_kernel kernel;
+
+    // subgroup mat vec
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0: {
+            kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                sgs = 16;
+                nsg = 1;
+                ndst = 8;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                sgs = 64;
+                nsg = 1;
+                ndst = 8;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20));
+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21));
+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
+            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3));
+
+            break;
+        }
+        default:
+            GGML_ASSERT(false && "not implemented");
+    }
+
+    int _ne1 = 1;
+    int ne123 = dst_rows;
+
+    size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
+    size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
 static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
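As a quick sanity check on the dispatch arithmetic in `ggml_cl_mul_mat_id` above, here is a standalone sketch with concrete numbers; the values (ne01 = 4096 output rows, the Adreno settings sgs = 64, nsg = 1, ndst = 8, and 4 used experts over 32 tokens) are chosen only for illustration and are not taken from the commit.

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    const int ne01  = 4096;   // rows of the selected expert matrix
    const int sgs   = 64;     // subgroup size (Adreno path)
    const int nsg   = 1;      // subgroups per workgroup
    const int ndst  = 8;      // rows produced per subgroup (N_DST)
    const int nrows = 1;
    const int _ne1  = 1;
    const int dst_rows = 4 * 32;  // e.g. 4 used experts * 32 tokens

    // Same expressions as in the host code above.
    size_t global[3] = {
        (size_t)(ne01 + ndst*nsg - 1)/(ndst*nsg)*sgs,  // ceil(4096/8) = 512 workgroups -> 512*64 = 32768
        (size_t)(_ne1 + nrows - 1)/nrows*nsg,          // 1
        (size_t)dst_rows                               // one workgroup layer per (expert slot, token)
    };
    size_t local[3] = {(size_t)sgs, (size_t)nsg, 1};

    printf("global = {%zu, %zu, %zu}, local = {%zu, %zu, %zu}\n",
           global[0], global[1], global[2], local[0], local[1], local[2]);
    return 0;
}
```

Each workgroup of sgs work-items covers ndst rows on axis 0, and axis 2 enumerates every (expert slot, token) pair, which is why the host passes `dst_rows = ne20*ne21` as the third dimension.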
 
@@ -6444,6 +6599,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_mul_mat;
             break;
+        case GGML_OP_MUL_MAT_ID:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_mul_mat_id;
+            break;
         case GGML_OP_SCALE:
             if (!any_on_device) {
                 return false;
ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl ADDED
@@ -0,0 +1,283 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define QK4_0 32
+
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+//------------------------------------------------------------------------------
+// block_q4_0
+//------------------------------------------------------------------------------
+struct block_q4_0
+{
+    half d;
+    uint8_t qs[QK4_0 / 2];
+};
+
+// This function requires the original shuffled weights.
+// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
+// packed together in a byte, so are (q[1], q[17]) and so on.
+inline float block_q_4_0_dot_y_flat(
+        global uchar * x,
+        global half * dh,
+        float sumy,
+        float16 yl,
+        int il
+) {
+    float d = *dh;
+    global ushort * qs = ((global ushort *)x + il/2);
+    float acc = 0.f;
+
+    acc += yl.s0 * (qs[0] & 0x000F);
+    acc += yl.s1 * (qs[0] & 0x0F00);
+    acc += yl.s8 * (qs[0] & 0x00F0);
+    acc += yl.s9 * (qs[0] & 0xF000);
+
+    acc += yl.s2 * (qs[1] & 0x000F);
+    acc += yl.s3 * (qs[1] & 0x0F00);
+    acc += yl.sa * (qs[1] & 0x00F0);
+    acc += yl.sb * (qs[1] & 0xF000);
+
+    acc += yl.s4 * (qs[2] & 0x000F);
+    acc += yl.s5 * (qs[2] & 0x0F00);
+    acc += yl.sc * (qs[2] & 0x00F0);
+    acc += yl.sd * (qs[2] & 0xF000);
+
+    acc += yl.s6 * (qs[3] & 0x000F);
+    acc += yl.s7 * (qs[3] & 0x0F00);
+    acc += yl.se * (qs[3] & 0x00F0);
+    acc += yl.sf * (qs[3] & 0xF000);
+
+    return d * (sumy * -8.f + acc);
+}
+
+//
+// This variant outputs 8 values.
+//
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 8 // each SIMD group works on 8 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // subgroup size
+#elif defined (ADRENO_GPU)
+#define N_DST 8
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+inline void mul_vec_q_n_f32_8x_flat(
+        global char * src0_q,
+        global half * src0_d,
+        global float * src1,
+        global float * dst,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    const ulong nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = 0;
+
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // The number of scales is the same as the number of blocks.
+    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half  * d = (global half  *) src0_d + offset0_d;
+    global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float16 yl;
+    float8 sumf = 0.f;
+
+    int ix = get_sub_group_local_id()/2;
+    int il = 8*(get_sub_group_local_id()%2);
+
+    global float * yb = y + ix*QK4_0 + il;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0.f;
+
+        sumy += yb[0];
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16];
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+        yl.s0 = yb[0];
+        yl.s1 = yb[1]/256.f;
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f;
+        yl.s9 = yb[17]/4096.f;
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
+        sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
+        sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
+        sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
+
+        sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
+        sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
+        sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
+        sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
+
+        yb += QK4_0 * (N_SIMDWIDTH/2);
+    }
+
+    float8 tot = (float8)(
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
+        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
+        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+
+        if (first_row + 4 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+        }
+        if (first_row + 5 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+        }
+        if (first_row + 6 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+        }
+        if (first_row + 7 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
+        global char * src0_q,
+        global half * src0_d,
+        global float * src1,
+        ulong offset1,
+        global char * src2,
+        ulong offset2,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        ulong nb00,
+        ulong nb02,
+        int ne10,
+        int ne11,
+        int ne12,
+        ulong nb11,
+        ulong nb12,
+        int ne20,
+        int ne21,
+        ulong nb21,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src1 = (global float *)((global char *)src1 + offset1);
+    src2 = (global char  *)((global char *)src2 + offset2);
+    dst  = (global float *)((global char *)dst  + offsetd);
+
+    const int iid1 = get_group_id(2)/ne20;
+    const int idx  = get_group_id(2)%ne20;
+
+    const int i02 = ((global int *)(src2 + iid1*nb21))[idx];
+
+    const int i11 = idx%ne11;
+    const int i12 = iid1;
+
+    const int i1 = idx;
+    const int i2 = i12;
+
+    global char  * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2);
+    global half  * src0_d_cur = src0_d + (i02*nb02/nb00);
+    global float * src1_cur   = (global float *)((global char *) src1 + i11*nb11 + i12*nb12);
+    global float * dst_cur    = dst + i1*ne0 + i2*ne1*ne0;
+
+    mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
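To make the packing that `block_q_4_0_dot_y_flat` relies on concrete, here is a small host-side sketch (an editor's illustration only, not part of the commit) of how one Q4_0 block stores 32 quantized values: byte i holds weight i in its low nibble and weight i + 16 in its high nibble, both biased by +8, all scaled by a single `d`. The struct name `BlockQ4_0` and the toy values are placeholders.

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the kernel's view of one Q4_0 block: a scale `d` (half on the device,
// kept in a separate "flat" buffer) and 16 bytes of packed 4-bit values.
struct BlockQ4_0 {
    float   d;        // scale (float here for simplicity)
    uint8_t qs[16];   // qs[i] packs q[i] (low nibble) and q[i + 16] (high nibble)
};

int main() {
    BlockQ4_0 b = {0.05f, {}};

    // Pack a toy block where the biased 4-bit value of weight i is i % 16.
    int q[32];
    for (int i = 0; i < 32; ++i) q[i] = i % 16;
    for (int i = 0; i < 16; ++i) {
        b.qs[i] = (uint8_t)((q[i] & 0x0F) | ((q[i + 16] & 0x0F) << 4));
    }

    // Unpack the same pairing the kernel's masks address:
    // low nibble -> weight i, high nibble -> weight i + 16, both minus the 8 bias.
    for (int i = 0; i < 16; ++i) {
        float w_lo = b.d * ((b.qs[i] & 0x0F) - 8);
        float w_hi = b.d * (((b.qs[i] >> 4) & 0x0F) - 8);
        printf("byte %2d -> w[%2d] = %+.3f, w[%2d] = %+.3f\n", i, i, w_lo, i + 16, w_hi);
    }
    return 0;
}
```

The kernel applies the same mapping two bytes at a time: it reads the packed data as `ushort`s, masks the four nibbles in place, folds the nibble shifts into the pre-scaled `yl` values (hence the divisions by 16, 256 and 4096), and accounts for the −8 bias once via the `sumy * -8.f` term.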