Nicolò Scipione committed on
Commit
2e59a96
·
1 Parent(s): 0bcd751

sycl: add usage of enqueue_functions extension (llama/14244)

Browse files

* Add header and namespace to use enqueue_functions extension

* Convert submit and parallel_for to use new extension in convert.cpp

* Convert submit and parallel_for to use extension in ggml-sycl.cpp

* Convert submit and parallel_for to use extension in gla.cpp

* Convert submit and parallel_for in mmq.cpp

* Convert submit and parallel_for in mmvq.cpp

* Convert submit and parallel_for in remaining files

* Convert all simple parallel_for to nd_launch from enqueue_functions
extension

* Wrapping extension in general function

Create a general function that enables the enqueue_functions extension if
it is enabled in the compiler; otherwise, call the general SYCL function
to launch kernels.

---------

Signed-off-by: nscipione <[email protected]>

ggml/src/ggml-sycl/binbcast.cpp CHANGED
@@ -225,9 +225,9 @@ struct bin_bcast_sycl {
225
  dpct::has_capability_or_fail(stream->get_device(),
226
  {sycl::aspect::fp16});
227
 
228
- stream->parallel_for(
229
- sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
230
- sycl::range<3>(1, 1, block_size),
231
  sycl::range<3>(1, 1, block_size)),
232
  [=](sycl::nd_item<3> item_ct1) {
233
  k_bin_bcast_unravel<bin_op>(
@@ -246,9 +246,8 @@ struct bin_bcast_sycl {
246
  dpct::has_capability_or_fail(stream->get_device(),
247
  {sycl::aspect::fp16});
248
 
249
- stream->parallel_for(
250
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
251
- [=](sycl::nd_item<3> item_ct1) {
252
  k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
253
  ne2, ne3, ne10, ne11, ne12, ne13,
254
  s1, s2, s3, s01, s02, s03, s11, s12, s13,
 
225
  dpct::has_capability_or_fail(stream->get_device(),
226
  {sycl::aspect::fp16});
227
 
228
+ sycl_parallel_for(
229
+ stream,
230
+ sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size),
231
  sycl::range<3>(1, 1, block_size)),
232
  [=](sycl::nd_item<3> item_ct1) {
233
  k_bin_bcast_unravel<bin_op>(
 
246
  dpct::has_capability_or_fail(stream->get_device(),
247
  {sycl::aspect::fp16});
248
 
249
+ sycl_parallel_for(
250
+ stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
251
  k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
252
  ne2, ne3, ne10, ne11, ne12, ne13,
253
  s1, s2, s3, s01, s02, s03, s11, s12, s13,
ggml/src/ggml-sycl/concat.cpp CHANGED
@@ -89,33 +89,24 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
89
  sycl::range<3> gridDim(ne2, ne1, num_blocks);
90
  switch (dim) {
91
  case 0:
92
- stream->parallel_for(
93
- sycl::nd_range<3>(gridDim *
94
- sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
95
- sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
96
- [=](sycl::nd_item<3> item_ct1) {
97
- concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
98
- });
99
- break;
100
  case 1:
101
- stream->parallel_for(
102
- sycl::nd_range<3>(gridDim *
103
- sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
104
- sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
105
- [=](sycl::nd_item<3> item_ct1) {
106
- concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
107
- });
108
- break;
109
  // dim >=2 will be dispatched to the default path
110
  default:
111
- stream->parallel_for(
112
- sycl::nd_range<3>(gridDim *
113
- sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
114
- sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
115
- [=](sycl::nd_item<3> item_ct1) {
116
- concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
117
- });
118
- break;
119
  }
120
  }
121
 
@@ -129,33 +120,29 @@ static void concat_f32_sycl_non_cont(
129
  int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2,
130
  uint64_t nb3, int32_t dim) {
131
  sycl::range<3> gridDim(ne3, ne2, ne1);
132
- stream->parallel_for(
133
- sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)),
134
- [=](sycl::nd_item<3> item_ct1) {
135
- int64_t i3 = item_ct1.get_group(0);
136
- int64_t i2 = item_ct1.get_group(1);
137
- int64_t i1 = item_ct1.get_group(2);
138
 
139
- int64_t o[4] = {0, 0, 0, 0};
140
- o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
141
 
142
- const float *x;
143
 
144
- for (int i0 = item_ct1.get_local_id(2); i0 < ne0;
145
- i0 += item_ct1.get_local_range(2)) {
146
  if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
147
- x = (const float *)(src0 + (i3)*nb03 + (i2)*nb02 + (i1)*nb01 +
148
- (i0)*nb00);
149
  } else {
150
- x = (const float *)(src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 +
151
- (i1 - o[1]) * nb11 + (i0 - o[0]) * nb10);
152
  }
153
 
154
  float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
155
 
156
  *y = *x;
157
- }
158
- });
159
  }
160
 
161
  void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
89
  sycl::range<3> gridDim(ne2, ne1, num_blocks);
90
  switch (dim) {
91
  case 0:
92
+ sycl_parallel_for(stream,
93
+ sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
94
+ sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
95
+ [=](sycl::nd_item<3> item_ct1) { concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); });
96
+ break;
 
 
 
97
  case 1:
98
+ sycl_parallel_for(stream,
99
+ sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
100
+ sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
101
+ [=](sycl::nd_item<3> item_ct1) { concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); });
102
+ break;
 
 
 
103
  // dim >=2 will be dispatched to the default path
104
  default:
105
+ sycl_parallel_for(stream,
106
+ sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
107
+ sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
108
+ [=](sycl::nd_item<3> item_ct1) { concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); });
109
+ break;
 
 
 
110
  }
111
  }
112
 
 
120
  int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2,
121
  uint64_t nb3, int32_t dim) {
122
  sycl::range<3> gridDim(ne3, ne2, ne1);
123
+ sycl_parallel_for(stream, sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
124
+ int64_t i3 = item_ct1.get_group(0);
125
+ int64_t i2 = item_ct1.get_group(1);
126
+ int64_t i1 = item_ct1.get_group(2);
 
 
127
 
128
+ int64_t o[4] = { 0, 0, 0, 0 };
129
+ o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
130
 
131
+ const float * x;
132
 
133
+ for (int i0 = item_ct1.get_local_id(2); i0 < ne0; i0 += item_ct1.get_local_range(2)) {
 
134
  if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
135
+ x = (const float *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00);
 
136
  } else {
137
+ x = (const float *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 +
138
+ (i0 - o[0]) * nb10);
139
  }
140
 
141
  float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
142
 
143
  *y = *x;
144
+ }
145
+ });
146
  }
147
 
148
  void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml/src/ggml-sycl/conv.cpp CHANGED
@@ -59,16 +59,10 @@ static void conv_transpose_1d_f32_f32_sycl(
59
  const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE;
60
  const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE);
61
  const sycl::range<3> block_nums(1, 1, num_blocks);
62
- stream->parallel_for(
63
- sycl::nd_range<3>(
64
- block_nums * block_dims, block_dims),
65
- [=](sycl::nd_item<3> item_ct1) {
66
- conv_transpose_1d_kernel(
67
- s0, output_size,
68
- src0_ne0, src0_ne1, src0_ne2,
69
- src1_ne0, dst_ne0,
70
- src0, src1, dst, item_ct1);
71
- });
72
  }
73
 
74
  void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
59
  const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE;
60
  const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE);
61
  const sycl::range<3> block_nums(1, 1, num_blocks);
62
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
63
+ conv_transpose_1d_kernel(s0, output_size, src0_ne0, src0_ne1, src0_ne2, src1_ne0, dst_ne0, src0, src1, dst,
64
+ item_ct1);
65
+ });
 
 
 
 
 
 
66
  }
67
 
68
  void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml/src/ggml-sycl/convert.cpp CHANGED
@@ -33,14 +33,11 @@ static void dequantize_block_sycl(const void *__restrict__ vx,
33
  {
34
  dpct::has_capability_or_fail(stream->get_device(),
35
  {sycl::aspect::fp16});
36
- stream->parallel_for(
37
- sycl::nd_range<3>(
38
- sycl::range<3>(1, 1, num_blocks) *
39
- sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
40
- sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
41
- [=](sycl::nd_item<3> item_ct1) {
42
- dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
43
- });
44
  }
45
  }
46
 
@@ -53,24 +50,18 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k,
53
  dpct::has_capability_or_fail(stream->get_device(),
54
  {sycl::aspect::fp16});
55
 
56
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
57
- sycl::range<3>(1, 1, 64),
58
- sycl::range<3>(1, 1, 64)),
59
- [=](sycl::nd_item<3> item_ct1) {
60
- dequantize_block_q2_K(vx, y, item_ct1);
61
- });
62
  }
63
  #else
64
  {
65
  dpct::has_capability_or_fail(stream->get_device(),
66
  {sycl::aspect::fp16});
67
 
68
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
69
- sycl::range<3>(1, 1, 32),
70
- sycl::range<3>(1, 1, 32)),
71
- [=](sycl::nd_item<3> item_ct1) {
72
- dequantize_block_q2_K(vx, y, item_ct1);
73
- });
74
  }
75
 
76
  #endif
@@ -85,24 +76,18 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k,
85
  dpct::has_capability_or_fail(stream->get_device(),
86
  {sycl::aspect::fp16});
87
 
88
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
89
- sycl::range<3>(1, 1, 64),
90
- sycl::range<3>(1, 1, 64)),
91
- [=](sycl::nd_item<3> item_ct1) {
92
- dequantize_block_q3_K(vx, y, item_ct1);
93
- });
94
  }
95
  #else
96
  {
97
  dpct::has_capability_or_fail(stream->get_device(),
98
  {sycl::aspect::fp16});
99
 
100
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
101
- sycl::range<3>(1, 1, 32),
102
- sycl::range<3>(1, 1, 32)),
103
- [=](sycl::nd_item<3> item_ct1) {
104
- dequantize_block_q3_K(vx, y, item_ct1);
105
- });
106
  }
107
  #endif
108
  }
@@ -116,12 +101,9 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
116
  dpct::has_capability_or_fail(stream->get_device(),
117
  {sycl::aspect::fp16});
118
 
119
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
120
- sycl::range<3>(1, 1, 32),
121
- sycl::range<3>(1, 1, 32)),
122
- [=](sycl::nd_item<3> item_ct1) {
123
- dequantize_block_q4_0(vx, y, nb32, item_ct1);
124
- });
125
  }
126
  }
127
 
@@ -135,13 +117,12 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int
135
  int constexpr WARP_K = WARP_SIZE * QK4_0;
136
  const int n_warp = (k + WARP_K - 1) / WARP_K;
137
  GGML_ASSERT(k % 2 == 0);
138
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
139
- sycl::range<3>(1, 1, WARP_SIZE),
140
- sycl::range<3>(1, 1, WARP_SIZE)),
141
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
142
- dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
143
- });
144
-
145
  }
146
 
147
  template <typename dst_t>
@@ -153,12 +134,9 @@ static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
153
  dpct::has_capability_or_fail(stream->get_device(),
154
  {sycl::aspect::fp16});
155
 
156
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
157
- sycl::range<3>(1, 1, 32),
158
- sycl::range<3>(1, 1, 32)),
159
- [=](sycl::nd_item<3> item_ct1) {
160
- dequantize_block_q4_1(vx, y, nb32, item_ct1);
161
- });
162
  }
163
  }
164
 
@@ -171,14 +149,13 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
171
  dpct::has_capability_or_fail(stream->get_device(),
172
  {sycl::aspect::fp16});
173
 
174
- stream->submit([&](sycl::handler &cgh) {
175
  sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
176
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
177
- sycl::range<3>(1, 1, 32),
178
- sycl::range<3>(1, 1, 32)),
179
- [=](sycl::nd_item<3> item_ct1) {
180
- dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
181
- });
182
  });
183
  }
184
  }
@@ -191,13 +168,13 @@ static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const i
191
 
192
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
193
 
194
- stream->submit([&](sycl::handler & cgh) {
195
  sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
196
 
197
- cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
198
- [=](sycl::nd_item<1> item_ct1) {
199
- dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
200
- });
201
  });
202
  }
203
 
@@ -210,24 +187,18 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
210
  dpct::has_capability_or_fail(stream->get_device(),
211
  {sycl::aspect::fp16});
212
 
213
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
214
- sycl::range<3>(1, 1, 64),
215
- sycl::range<3>(1, 1, 64)),
216
- [=](sycl::nd_item<3> item_ct1) {
217
- dequantize_block_q5_K(vx, y, item_ct1);
218
- });
219
  }
220
  #else
221
  {
222
  dpct::has_capability_or_fail(stream->get_device(),
223
  {sycl::aspect::fp16});
224
 
225
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
226
- sycl::range<3>(1, 1, 32),
227
- sycl::range<3>(1, 1, 32)),
228
- [=](sycl::nd_item<3> item_ct1) {
229
- dequantize_block_q5_K(vx, y, item_ct1);
230
- });
231
  }
232
 
233
  #endif
@@ -242,24 +213,18 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
242
  dpct::has_capability_or_fail(stream->get_device(),
243
  {sycl::aspect::fp16});
244
 
245
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
246
- sycl::range<3>(1, 1, 64),
247
- sycl::range<3>(1, 1, 64)),
248
- [=](sycl::nd_item<3> item_ct1) {
249
- dequantize_block_q6_K(vx, y, item_ct1);
250
- });
251
  }
252
  #else
253
  {
254
  dpct::has_capability_or_fail(stream->get_device(),
255
  {sycl::aspect::fp16});
256
 
257
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
258
- sycl::range<3>(1, 1, 32),
259
- sycl::range<3>(1, 1, 32)),
260
- [=](sycl::nd_item<3> item_ct1) {
261
- dequantize_block_q6_K(vx, y, item_ct1);
262
- });
263
  }
264
 
265
  #endif
@@ -271,9 +236,9 @@ static void dequantize_row_q6_K_sycl_reorder(const void * vx, dst_t * y, const i
271
 
272
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
273
 
274
- stream->parallel_for(
275
- sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
276
- [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); });
277
  }
278
 
279
  template <typename dst_t>
@@ -284,15 +249,10 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k,
284
  dpct::has_capability_or_fail(stream->get_device(),
285
  {sycl::aspect::fp16});
286
 
287
- stream->submit([&](sycl::handler &cgh) {
288
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
289
- sycl::range<3>(1, 1, 32),
290
- sycl::range<3>(1, 1, 32)),
291
- [=](sycl::nd_item<3> item_ct1) {
292
- dequantize_block_iq1_s(
293
- vx, y, item_ct1, iq1s_grid_gpu
294
- );
295
- });
296
  });
297
  }
298
  }
@@ -305,15 +265,10 @@ static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k,
305
  dpct::has_capability_or_fail(stream->get_device(),
306
  {sycl::aspect::fp16});
307
 
308
- stream->submit([&](sycl::handler &cgh) {
309
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
310
- sycl::range<3>(1, 1, 32),
311
- sycl::range<3>(1, 1, 32)),
312
- [=](sycl::nd_item<3> item_ct1) {
313
- dequantize_block_iq1_m(
314
- vx, y, item_ct1, iq1s_grid_gpu
315
- );
316
- });
317
  });
318
  }
319
  }
@@ -326,15 +281,12 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t
326
  dpct::has_capability_or_fail(stream->get_device(),
327
  {sycl::aspect::fp16});
328
 
329
- stream->submit([&](sycl::handler &cgh) {
330
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
331
- sycl::range<3>(1, 1, 32),
332
- sycl::range<3>(1, 1, 32)),
333
- [=](sycl::nd_item<3> item_ct1) {
334
- dequantize_block_iq2_xxs(
335
- vx, y, item_ct1, iq2xxs_grid,
336
- ksigns_iq2xs, kmask_iq2xs);
337
- });
338
  });
339
  }
340
  }
@@ -347,15 +299,12 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k
347
  dpct::has_capability_or_fail(stream->get_device(),
348
  {sycl::aspect::fp16});
349
 
350
- stream->submit([&](sycl::handler &cgh) {
351
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
352
- sycl::range<3>(1, 1, 32),
353
- sycl::range<3>(1, 1, 32)),
354
- [=](sycl::nd_item<3> item_ct1) {
355
- dequantize_block_iq2_xs(
356
- vx, y, item_ct1, iq2xs_grid,
357
- ksigns_iq2xs, kmask_iq2xs);
358
- });
359
  });
360
  }
361
  }
@@ -368,13 +317,10 @@ static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k,
368
  dpct::has_capability_or_fail(stream->get_device(),
369
  {sycl::aspect::fp16});
370
 
371
- stream->submit([&](sycl::handler &cgh) {
372
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
373
- sycl::range<3>(1, 1, 32),
374
- sycl::range<3>(1, 1, 32)),
375
- [=](sycl::nd_item<3> item_ct1) {
376
- dequantize_block_iq2_s(vx, y, item_ct1);
377
- });
378
  });
379
  }
380
  }
@@ -388,15 +334,12 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t
388
  dpct::has_capability_or_fail(stream->get_device(),
389
  {sycl::aspect::fp16});
390
 
391
- stream->submit([&](sycl::handler &cgh) {
392
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
393
- sycl::range<3>(1, 1, 32),
394
- sycl::range<3>(1, 1, 32)),
395
- [=](sycl::nd_item<3> item_ct1) {
396
- dequantize_block_iq3_xxs(
397
- vx, y, item_ct1, iq3xxs_grid,
398
- ksigns_iq2xs, kmask_iq2xs);
399
- });
400
  });
401
  }
402
  }
@@ -409,14 +352,10 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k,
409
  dpct::has_capability_or_fail(stream->get_device(),
410
  {sycl::aspect::fp16});
411
 
412
- stream->submit([&](sycl::handler &cgh) {
413
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
414
- sycl::range<3>(1, 1, 32),
415
- sycl::range<3>(1, 1, 32)),
416
- [=](sycl::nd_item<3> item_ct1) {
417
- dequantize_block_iq3_s(
418
- vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
419
- });
420
  });
421
  }
422
  }
@@ -432,14 +371,11 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k
432
  dpct::has_capability_or_fail(stream->get_device(),
433
  {sycl::aspect::fp16});
434
 
435
- stream->submit([&](sycl::handler &cgh) {
436
- cgh.parallel_for(
437
- sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
438
- sycl::range<3>(1, 1, 32),
439
- sycl::range<3>(1, 1, 32)),
440
- [=](sycl::nd_item<3> item_ct1) {
441
- dequantize_block_iq4_xs(vx, y, item_ct1);
442
- });
443
  });
444
  }
445
  #endif
@@ -453,14 +389,11 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
453
  dpct::has_capability_or_fail(stream->get_device(),
454
  {sycl::aspect::fp16});
455
 
456
- stream->submit([&](sycl::handler &cgh) {
457
- cgh.parallel_for(
458
- sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
459
- sycl::range<3>(1, 1, 32),
460
- sycl::range<3>(1, 1, 32)),
461
- [=](sycl::nd_item<3> item_ct1) {
462
- dequantize_block_iq4_nl(vx, y, item_ct1);
463
- });
464
  });
465
  }
466
  }
 
33
  {
34
  dpct::has_capability_or_fail(stream->get_device(),
35
  {sycl::aspect::fp16});
36
+ sycl_parallel_for(
37
+ stream,
38
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
39
+ sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
40
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1); });
 
 
 
41
  }
42
  }
43
 
 
50
  dpct::has_capability_or_fail(stream->get_device(),
51
  {sycl::aspect::fp16});
52
 
53
+ sycl_parallel_for(
54
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
55
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); });
 
 
 
56
  }
57
  #else
58
  {
59
  dpct::has_capability_or_fail(stream->get_device(),
60
  {sycl::aspect::fp16});
61
 
62
+ sycl_parallel_for(
63
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
64
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); });
 
 
 
65
  }
66
 
67
  #endif
 
76
  dpct::has_capability_or_fail(stream->get_device(),
77
  {sycl::aspect::fp16});
78
 
79
+ sycl_parallel_for(
80
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
81
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); });
 
 
 
82
  }
83
  #else
84
  {
85
  dpct::has_capability_or_fail(stream->get_device(),
86
  {sycl::aspect::fp16});
87
 
88
+ sycl_parallel_for(
89
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
90
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); });
 
 
 
91
  }
92
  #endif
93
  }
 
101
  dpct::has_capability_or_fail(stream->get_device(),
102
  {sycl::aspect::fp16});
103
 
104
+ sycl_parallel_for(
105
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
106
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_0(vx, y, nb32, item_ct1); });
 
 
 
107
  }
108
  }
109
 
 
117
  int constexpr WARP_K = WARP_SIZE * QK4_0;
118
  const int n_warp = (k + WARP_K - 1) / WARP_K;
119
  GGML_ASSERT(k % 2 == 0);
120
+ sycl_parallel_for(stream,
121
+ sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) * sycl::range<3>(1, 1, WARP_SIZE),
122
+ sycl::range<3>(1, 1, WARP_SIZE)),
123
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
124
+ dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
125
+ });
 
126
  }
127
 
128
  template <typename dst_t>
 
134
  dpct::has_capability_or_fail(stream->get_device(),
135
  {sycl::aspect::fp16});
136
 
137
+ sycl_parallel_for(
138
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
139
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_1(vx, y, nb32, item_ct1); });
 
 
 
140
  }
141
  }
142
 
 
149
  dpct::has_capability_or_fail(stream->get_device(),
150
  {sycl::aspect::fp16});
151
 
152
+ sycl_launch(stream, [&](sycl::handler & cgh) {
153
  sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
154
+ sycl_parallel_for(
155
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
156
+ [=](sycl::nd_item<3> item_ct1) {
157
+ dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
158
+ });
 
159
  });
160
  }
161
  }
 
168
 
169
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
170
 
171
+ sycl_launch(stream, [&](sycl::handler & cgh) {
172
  sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
173
 
174
+ sycl_parallel_for<1>(cgh, sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
175
+ [=](sycl::nd_item<1> item_ct1) {
176
+ dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
177
+ });
178
  });
179
  }
180
 
 
187
  dpct::has_capability_or_fail(stream->get_device(),
188
  {sycl::aspect::fp16});
189
 
190
+ sycl_parallel_for(
191
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
192
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); });
 
 
 
193
  }
194
  #else
195
  {
196
  dpct::has_capability_or_fail(stream->get_device(),
197
  {sycl::aspect::fp16});
198
 
199
+ sycl_parallel_for(
200
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
201
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); });
 
 
 
202
  }
203
 
204
  #endif
 
213
  dpct::has_capability_or_fail(stream->get_device(),
214
  {sycl::aspect::fp16});
215
 
216
+ sycl_parallel_for(
217
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
218
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); });
 
 
 
219
  }
220
  #else
221
  {
222
  dpct::has_capability_or_fail(stream->get_device(),
223
  {sycl::aspect::fp16});
224
 
225
+ sycl_parallel_for(
226
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
227
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); });
 
 
 
228
  }
229
 
230
  #endif
 
236
 
237
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
238
 
239
+ sycl_parallel_for(stream,
240
+ sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
241
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); });
242
  }
243
 
244
  template <typename dst_t>
 
249
  dpct::has_capability_or_fail(stream->get_device(),
250
  {sycl::aspect::fp16});
251
 
252
+ sycl_launch(stream, [&](sycl::handler & cgh) {
253
+ sycl_parallel_for(
254
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
255
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_s(vx, y, item_ct1, iq1s_grid_gpu); });
 
 
 
 
 
256
  });
257
  }
258
  }
 
265
  dpct::has_capability_or_fail(stream->get_device(),
266
  {sycl::aspect::fp16});
267
 
268
+ sycl_launch(stream, [&](sycl::handler & cgh) {
269
+ sycl_parallel_for(
270
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
271
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_m(vx, y, item_ct1, iq1s_grid_gpu); });
 
 
 
 
 
272
  });
273
  }
274
  }
 
281
  dpct::has_capability_or_fail(stream->get_device(),
282
  {sycl::aspect::fp16});
283
 
284
+ sycl_launch(stream, [&](sycl::handler & cgh) {
285
+ sycl_parallel_for(
286
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
287
+ [=](sycl::nd_item<3> item_ct1) {
288
+ dequantize_block_iq2_xxs(vx, y, item_ct1, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
289
+ });
 
 
 
290
  });
291
  }
292
  }
 
299
  dpct::has_capability_or_fail(stream->get_device(),
300
  {sycl::aspect::fp16});
301
 
302
+ sycl_launch(stream, [&](sycl::handler & cgh) {
303
+ sycl_parallel_for(
304
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
305
+ [=](sycl::nd_item<3> item_ct1) {
306
+ dequantize_block_iq2_xs(vx, y, item_ct1, iq2xs_grid, ksigns_iq2xs, kmask_iq2xs);
307
+ });
 
 
 
308
  });
309
  }
310
  }
 
317
  dpct::has_capability_or_fail(stream->get_device(),
318
  {sycl::aspect::fp16});
319
 
320
+ sycl_launch(stream, [&](sycl::handler & cgh) {
321
+ sycl_parallel_for(
322
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
323
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq2_s(vx, y, item_ct1); });
 
 
 
324
  });
325
  }
326
  }
 
334
  dpct::has_capability_or_fail(stream->get_device(),
335
  {sycl::aspect::fp16});
336
 
337
+ sycl_launch(stream, [&](sycl::handler & cgh) {
338
+ sycl_parallel_for(
339
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
340
+ [=](sycl::nd_item<3> item_ct1) {
341
+ dequantize_block_iq3_xxs(vx, y, item_ct1, iq3xxs_grid, ksigns_iq2xs, kmask_iq2xs);
342
+ });
 
 
 
343
  });
344
  }
345
  }
 
352
  dpct::has_capability_or_fail(stream->get_device(),
353
  {sycl::aspect::fp16});
354
 
355
+ sycl_launch(stream, [&](sycl::handler & cgh) {
356
+ sycl_parallel_for(
357
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
358
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq3_s(vx, y, item_ct1, kmask_iq2xs, iq3s_grid); });
 
 
 
 
359
  });
360
  }
361
  }
 
371
  dpct::has_capability_or_fail(stream->get_device(),
372
  {sycl::aspect::fp16});
373
 
374
+ sycl_launch(stream, [&](sycl::handler & cgh) {
375
+ sycl_parallel_for(
376
+ cgh,
377
+ sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
378
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_xs(vx, y, item_ct1); });
 
 
 
379
  });
380
  }
381
  #endif
 
389
  dpct::has_capability_or_fail(stream->get_device(),
390
  {sycl::aspect::fp16});
391
 
392
+ sycl_launch(stream, [&](sycl::handler & cgh) {
393
+ sycl_parallel_for(
394
+ cgh,
395
+ sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
396
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_nl(vx, y, item_ct1); });
 
 
 
397
  });
398
  }
399
  }
ggml/src/ggml-sycl/cpy.cpp CHANGED
@@ -413,7 +413,8 @@ static void ggml_cpy_f16_f32_sycl(const char * cx, char * cdst, const int ne, co
413
  {
414
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
415
 
416
- stream->parallel_for(
 
417
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
418
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
419
  [=](sycl::nd_item<3> item_ct1) {
@@ -431,7 +432,8 @@ static void ggml_cpy_f32_f32_sycl(const char * cx, char * cdst, const int ne, co
431
  {
432
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
433
 
434
- stream->parallel_for(
 
435
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
436
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
437
  [=](sycl::nd_item<3> item_ct1) {
@@ -449,7 +451,8 @@ static void ggml_cpy_f32_f16_sycl(const char * cx, char * cdst, const int ne, co
449
  {
450
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
451
 
452
- stream->parallel_for(
 
453
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
454
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
455
  [=](sycl::nd_item<3> item_ct1) {
@@ -465,11 +468,11 @@ static void ggml_cpy_f32_q8_0_sycl(const char * cx, char * cdst, const int ne, c
465
  const int nb12, const int nb13, queue_ptr stream) {
466
  GGML_ASSERT(ne % QK8_0 == 0);
467
  const int num_blocks = ne / QK8_0;
468
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
469
- [=](sycl::nd_item<3> item_ct1) {
470
- cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
471
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
472
- });
473
  }
474
 
475
  static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -477,11 +480,11 @@ static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, c
477
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
478
  const int nb12, const int nb13, queue_ptr stream) {
479
  const int num_blocks = ne;
480
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
481
- [=](sycl::nd_item<3> item_ct1) {
482
- cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
483
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
484
- });
485
  }
486
 
487
  static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -490,11 +493,11 @@ static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, c
490
  const int nb12, const int nb13, queue_ptr stream) {
491
  GGML_ASSERT(ne % QK4_0 == 0);
492
  const int num_blocks = ne / QK4_0;
493
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
494
- [=](sycl::nd_item<3> item_ct1) {
495
- cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
496
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
497
- });
498
  }
499
 
500
  static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -502,8 +505,9 @@ static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, c
502
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
503
  const int nb12, const int nb13, queue_ptr stream) {
504
  const int num_blocks = ne;
505
- stream->parallel_for(
506
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
 
507
  cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
508
  nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
509
  item_ct1);
@@ -516,11 +520,11 @@ static void ggml_cpy_f32_q4_1_sycl(const char * cx, char * cdst, const int ne, c
516
  const int nb12, const int nb13, queue_ptr stream) {
517
  GGML_ASSERT(ne % QK4_1 == 0);
518
  const int num_blocks = ne / QK4_1;
519
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
520
- [=](sycl::nd_item<3> item_ct1) {
521
- cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
522
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
523
- });
524
  }
525
 
526
  static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -528,8 +532,9 @@ static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, c
528
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
529
  const int nb12, const int nb13, queue_ptr stream) {
530
  const int num_blocks = ne;
531
- stream->parallel_for(
532
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
 
533
  cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
534
  nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
535
  item_ct1);
@@ -542,11 +547,11 @@ static void ggml_cpy_f32_q5_0_sycl(const char * cx, char * cdst, const int ne, c
542
  const int nb12, const int nb13, queue_ptr stream) {
543
  GGML_ASSERT(ne % QK5_0 == 0);
544
  const int num_blocks = ne / QK5_0;
545
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
546
- [=](sycl::nd_item<3> item_ct1) {
547
- cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
548
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
549
- });
550
  }
551
 
552
  static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -554,8 +559,9 @@ static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, c
554
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
555
  const int nb12, const int nb13, queue_ptr stream) {
556
  const int num_blocks = ne;
557
- stream->parallel_for(
558
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
 
559
  cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
560
  nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
561
  item_ct1);
@@ -568,11 +574,11 @@ static void ggml_cpy_f32_q5_1_sycl(const char * cx, char * cdst, const int ne, c
568
  const int nb12, const int nb13, queue_ptr stream) {
569
  GGML_ASSERT(ne % QK5_1 == 0);
570
  const int num_blocks = ne / QK5_1;
571
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
572
- [=](sycl::nd_item<3> item_ct1) {
573
- cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
574
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
575
- });
576
  }
577
 
578
  static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -580,8 +586,9 @@ static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, c
580
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
581
  const int nb12, const int nb13, queue_ptr stream) {
582
  const int num_blocks = ne;
583
- stream->parallel_for(
584
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
 
585
  cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
586
  nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
587
  item_ct1);
@@ -594,11 +601,11 @@ static void ggml_cpy_f32_iq4_nl_sycl(const char * cx, char * cdst, const int ne,
594
  const int nb12, const int nb13, queue_ptr stream) {
595
  GGML_ASSERT(ne % QK4_NL == 0);
596
  const int num_blocks = ne / QK4_NL;
597
- stream->parallel_for(
598
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
599
- cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
600
- ne12, nb10, nb11, nb12, nb13, item_ct1);
601
- });
602
  }
603
 
604
  static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -609,7 +616,8 @@ static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, co
609
  {
610
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
611
 
612
- stream->parallel_for(
 
613
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
614
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
615
  [=](sycl::nd_item<3> item_ct1) {
@@ -628,7 +636,8 @@ static void ggml_cpy_i16_i16_sycl(const char * cx, char * cdst, const int ne, co
628
  // dpct::has_capability_or_fail(stream->get_device(),
629
  // {sycl::aspect::fp16});
630
 
631
- stream->parallel_for(
 
632
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
633
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
634
  [=](sycl::nd_item<3> item_ct1) {
@@ -647,7 +656,8 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
647
  // dpct::has_capability_or_fail(stream->get_device(),
648
  // {sycl::aspect::fp16});
649
 
650
- stream->parallel_for(
 
651
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
652
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
653
  [=](sycl::nd_item<3> item_ct1) {
@@ -662,11 +672,13 @@ static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const
662
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
663
  const int nb12, const int nb13, queue_ptr stream) {
664
  const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
665
- stream->parallel_for(
666
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
667
- sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
668
- cpy_q_q<block_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
669
- });
 
 
670
  }
671
 
672
 
@@ -675,11 +687,13 @@ static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const
675
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
676
  const int nb12, const int nb13, queue_ptr stream) {
677
  const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
678
- stream->parallel_for(
679
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
680
- sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
681
- cpy_q_q<block_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
682
- });
 
 
683
  }
684
 
685
 
@@ -689,11 +703,13 @@ static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const
689
  const int nb12, const int nb13, queue_ptr stream) {
690
  const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
691
 
692
- stream->parallel_for(
693
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
694
- sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
695
- cpy_q_q<block_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
696
- });
 
 
697
  }
698
 
699
 
@@ -702,10 +718,13 @@ static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const
702
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
703
  const int nb12, const int nb13, queue_ptr stream) {
704
  const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
705
- stream->parallel_for(
706
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
707
- cpy_q_q<block_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
708
- });
 
 
 
709
  }
710
 
711
 
@@ -715,10 +734,13 @@ static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const
715
  const int nb12, const int nb13, queue_ptr stream) {
716
 
717
  const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
718
- stream->parallel_for(
719
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
720
- cpy_q_q<block_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
721
- });
 
 
 
722
  }
723
 
724
  void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
 
413
  {
414
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
415
 
416
+ sycl_parallel_for(
417
+ stream,
418
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
419
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
420
  [=](sycl::nd_item<3> item_ct1) {
 
432
  {
433
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
434
 
435
+ sycl_parallel_for(
436
+ stream,
437
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
438
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
439
  [=](sycl::nd_item<3> item_ct1) {
 
451
  {
452
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
453
 
454
+ sycl_parallel_for(
455
+ stream,
456
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
457
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
458
  [=](sycl::nd_item<3> item_ct1) {
 
468
  const int nb12, const int nb13, queue_ptr stream) {
469
  GGML_ASSERT(ne % QK8_0 == 0);
470
  const int num_blocks = ne / QK8_0;
471
+ sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
472
+ [=](sycl::nd_item<3> item_ct1) {
473
+ cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
474
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
475
+ });
476
  }
477
 
478
  static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
 
480
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
481
  const int nb12, const int nb13, queue_ptr stream) {
482
  const int num_blocks = ne;
483
+ sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
484
+ [=](sycl::nd_item<3> item_ct1) {
485
+ cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
486
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
487
+ });
488
  }
489
 
490
  static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
 
493
  const int nb12, const int nb13, queue_ptr stream) {
494
  GGML_ASSERT(ne % QK4_0 == 0);
495
  const int num_blocks = ne / QK4_0;
496
+ sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
497
+ [=](sycl::nd_item<3> item_ct1) {
498
+ cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
499
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
500
+ });
501
  }
502
 
503
  static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
 
505
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
506
  const int nb12, const int nb13, queue_ptr stream) {
507
  const int num_blocks = ne;
508
+ sycl_parallel_for(
509
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
510
+ [=](sycl::nd_item<3> item_ct1) {
511
  cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
512
  nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
513
  item_ct1);
 
520
  const int nb12, const int nb13, queue_ptr stream) {
521
  GGML_ASSERT(ne % QK4_1 == 0);
522
  const int num_blocks = ne / QK4_1;
523
+ sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
524
+ [=](sycl::nd_item<3> item_ct1) {
525
+ cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
526
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
527
+ });
528
  }
529
 
530
  static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
 
532
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
533
  const int nb12, const int nb13, queue_ptr stream) {
534
  const int num_blocks = ne;
535
+ sycl_parallel_for(
536
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
537
+ [=](sycl::nd_item<3> item_ct1) {
538
  cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
539
  nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
540
  item_ct1);
 
547
  const int nb12, const int nb13, queue_ptr stream) {
548
  GGML_ASSERT(ne % QK5_0 == 0);
549
  const int num_blocks = ne / QK5_0;
550
+ sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
551
+ [=](sycl::nd_item<3> item_ct1) {
552
+ cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
553
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
554
+ });
555
  }
556
 
557
  static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
 
559
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
560
  const int nb12, const int nb13, queue_ptr stream) {
561
  const int num_blocks = ne;
562
+ sycl_parallel_for(
563
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
564
+ [=](sycl::nd_item<3> item_ct1) {
565
  cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
566
  nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
567
  item_ct1);
 
574
  const int nb12, const int nb13, queue_ptr stream) {
575
  GGML_ASSERT(ne % QK5_1 == 0);
576
  const int num_blocks = ne / QK5_1;
577
+ sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
578
+ [=](sycl::nd_item<3> item_ct1) {
579
+ cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
580
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
581
+ });
582
  }
583
 
584
  static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
 
586
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
587
  const int nb12, const int nb13, queue_ptr stream) {
588
  const int num_blocks = ne;
589
+ sycl_parallel_for(
590
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
591
+ [=](sycl::nd_item<3> item_ct1) {
592
  cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
593
  nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
594
  item_ct1);
 
601
  const int nb12, const int nb13, queue_ptr stream) {
602
  GGML_ASSERT(ne % QK4_NL == 0);
603
  const int num_blocks = ne / QK4_NL;
604
+ sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
605
+ [=](sycl::nd_item<3> item_ct1) {
606
+ cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
607
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
608
+ });
609
  }
610
 
611
  static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
 
616
  {
617
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
618
 
619
+ sycl_parallel_for(
620
+ stream,
621
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
622
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
623
  [=](sycl::nd_item<3> item_ct1) {
 
636
  // dpct::has_capability_or_fail(stream->get_device(),
637
  // {sycl::aspect::fp16});
638
 
639
+ sycl_parallel_for(
640
+ stream,
641
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
642
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
643
  [=](sycl::nd_item<3> item_ct1) {
 
656
  // dpct::has_capability_or_fail(stream->get_device(),
657
  // {sycl::aspect::fp16});
658
 
659
+ sycl_parallel_for(
660
+ stream,
661
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
662
  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
663
  [=](sycl::nd_item<3> item_ct1) {
 
672
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
673
  const int nb12, const int nb13, queue_ptr stream) {
674
  const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
675
+ sycl_parallel_for(stream,
676
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
677
+ sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
678
+ [=](sycl::nd_item<3> item_ct1) {
679
+ cpy_q_q<block_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
680
+ ne12, nb10, nb11, nb12, nb13, item_ct1);
681
+ });
682
  }
683
 
684
 
 
687
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
688
  const int nb12, const int nb13, queue_ptr stream) {
689
  const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
690
+ sycl_parallel_for(stream,
691
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
692
+ sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
693
+ [=](sycl::nd_item<3> item_ct1) {
694
+ cpy_q_q<block_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
695
+ ne12, nb10, nb11, nb12, nb13, item_ct1);
696
+ });
697
  }
698
 
699
 
 
703
  const int nb12, const int nb13, queue_ptr stream) {
704
  const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
705
 
706
+ sycl_parallel_for(stream,
707
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
708
+ sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
709
+ [=](sycl::nd_item<3> item_ct1) {
710
+ cpy_q_q<block_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
711
+ ne12, nb10, nb11, nb12, nb13, item_ct1);
712
+ });
713
  }
714
 
715
 
 
718
  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
719
  const int nb12, const int nb13, queue_ptr stream) {
720
  const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
721
+ sycl_parallel_for(stream,
722
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
723
+ sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
724
+ [=](sycl::nd_item<3> item_ct1) {
725
+ cpy_q_q<block_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
726
+ ne12, nb10, nb11, nb12, nb13, item_ct1);
727
+ });
728
  }
729
 
730
 
 
734
  const int nb12, const int nb13, queue_ptr stream) {
735
 
736
  const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
737
+ sycl_parallel_for(stream,
738
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
739
+ sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
740
+ [=](sycl::nd_item<3> item_ct1) {
741
+ cpy_q_q<block_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
742
+ ne12, nb10, nb11, nb12, nb13, item_ct1);
743
+ });
744
  }
745
 
746
  void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
ggml/src/ggml-sycl/dmmv.cpp CHANGED
@@ -208,12 +208,10 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
208
  dpct::has_capability_or_fail(stream->get_device(),
209
  {sycl::aspect::fp16});
210
 
211
- stream->parallel_for(
212
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
213
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
214
- dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
215
- nrows, item_ct1);
216
- });
217
  }
218
  }
219
 
@@ -877,12 +875,11 @@ static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloa
877
  dpct::has_capability_or_fail(stream->get_device(),
878
  {sycl::aspect::fp16});
879
 
880
- stream->parallel_for(
881
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
882
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
883
- dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(
884
- vx, y, dst, ncols, nrows, item_ct1);
885
- });
886
  }
887
  }
888
 
@@ -900,12 +897,10 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
900
  dpct::has_capability_or_fail(stream->get_device(),
901
  {sycl::aspect::fp16});
902
 
903
- stream->parallel_for(
904
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
905
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
906
- dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
907
- vx, y, dst, ncols, nrows, item_ct1);
908
- });
909
  }
910
  }
911
 
@@ -921,12 +916,10 @@ static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
921
  dpct::has_capability_or_fail(stream->get_device(),
922
  {sycl::aspect::fp16});
923
 
924
- stream->parallel_for(
925
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
926
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
927
- dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
928
- vx, y, dst, ncols, nrows, item_ct1);
929
- });
930
  }
931
  }
932
 
@@ -942,12 +935,10 @@ static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y,
942
  dpct::has_capability_or_fail(stream->get_device(),
943
  {sycl::aspect::fp16});
944
 
945
- stream->parallel_for(
946
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
947
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
948
- dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
949
- vx, y, dst, ncols, nrows, item_ct1);
950
- });
951
  }
952
  }
953
 
@@ -963,12 +954,10 @@ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
963
  dpct::has_capability_or_fail(stream->get_device(),
964
  {sycl::aspect::fp16});
965
 
966
- stream->parallel_for(
967
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
968
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
969
- dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
970
- vx, y, dst, ncols, nrows, item_ct1);
971
- });
972
  }
973
  }
974
 
@@ -984,12 +973,10 @@ static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
984
  dpct::has_capability_or_fail(stream->get_device(),
985
  {sycl::aspect::fp16});
986
 
987
- stream->parallel_for(
988
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
989
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
990
- dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
991
- vx, y, dst, ncols, nrows, item_ct1);
992
- });
993
  }
994
  }
995
 
@@ -1002,11 +989,10 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
1002
  const int block_num_y = (nrows + ny - 1) / ny;
1003
  const sycl::range<3> block_nums(1, 1, block_num_y);
1004
  const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1005
- stream->parallel_for(
1006
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1007
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1008
- dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
1009
- });
1010
  }
1011
 
1012
  static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
@@ -1018,11 +1004,10 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
1018
  const int block_num_y = (nrows + ny - 1) / ny;
1019
  const sycl::range<3> block_nums(1, 1, block_num_y);
1020
  const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1021
- stream->parallel_for(
1022
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1023
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1024
- dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
1025
- });
1026
  }
1027
 
1028
  static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
@@ -1034,11 +1019,10 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
1034
  const int block_num_y = (nrows + ny - 1) / ny;
1035
  const sycl::range<3> block_nums(1, 1, block_num_y);
1036
  const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1037
- stream->parallel_for(
1038
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1039
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1040
- dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
1041
- });
1042
  }
1043
 
1044
  static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
@@ -1047,11 +1031,10 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
1047
  dpct::queue_ptr stream) {
1048
  GGML_ASSERT(ncols % QK_K == 0);
1049
  const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
1050
- stream->parallel_for(
1051
- sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
1052
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1053
- dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
1054
- });
1055
  }
1056
 
1057
  static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
@@ -1063,11 +1046,10 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
1063
  const int block_num_y = (nrows + ny - 1) / ny;
1064
  const sycl::range<3> block_nums(1, 1, block_num_y);
1065
  const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1066
- stream->parallel_for(
1067
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1068
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1069
- dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
1070
- });
1071
  }
1072
 
1073
  void ggml_sycl_op_dequantize_mul_mat_vec(
 
208
  dpct::has_capability_or_fail(stream->get_device(),
209
  {sycl::aspect::fp16});
210
 
211
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
212
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
213
+ dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, nrows, item_ct1);
214
+ });
 
 
215
  }
216
  }
217
 
 
875
  dpct::has_capability_or_fail(stream->get_device(),
876
  {sycl::aspect::fp16});
877
 
878
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
879
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
880
+ dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(vx, y, dst, ncols,
881
+ nrows, item_ct1);
882
+ });
 
883
  }
884
  }
885
 
 
897
  dpct::has_capability_or_fail(stream->get_device(),
898
  {sycl::aspect::fp16});
899
 
900
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
901
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
902
+ dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(vx, y, dst, ncols, nrows, item_ct1);
903
+ });
 
 
904
  }
905
  }
906
 
 
916
  dpct::has_capability_or_fail(stream->get_device(),
917
  {sycl::aspect::fp16});
918
 
919
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
920
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
921
+ dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(vx, y, dst, ncols, nrows, item_ct1);
922
+ });
 
 
923
  }
924
  }
925
 
 
935
  dpct::has_capability_or_fail(stream->get_device(),
936
  {sycl::aspect::fp16});
937
 
938
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
939
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
940
+ dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(vx, y, dst, ncols, nrows, item_ct1);
941
+ });
 
 
942
  }
943
  }
944
 
 
954
  dpct::has_capability_or_fail(stream->get_device(),
955
  {sycl::aspect::fp16});
956
 
957
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
958
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
959
+ dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(vx, y, dst, ncols, nrows, item_ct1);
960
+ });
 
 
961
  }
962
  }
963
 
 
973
  dpct::has_capability_or_fail(stream->get_device(),
974
  {sycl::aspect::fp16});
975
 
976
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
977
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
978
+ dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(vx, y, dst, ncols, nrows, item_ct1);
979
+ });
 
 
980
  }
981
  }
982
 
 
989
  const int block_num_y = (nrows + ny - 1) / ny;
990
  const sycl::range<3> block_nums(1, 1, block_num_y);
991
  const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
992
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
993
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
994
+ dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
995
+ });
 
996
  }
997
 
998
  static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
 
1004
  const int block_num_y = (nrows + ny - 1) / ny;
1005
  const sycl::range<3> block_nums(1, 1, block_num_y);
1006
  const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1007
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
1008
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1009
+ dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
1010
+ });
 
1011
  }
1012
 
1013
  static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
 
1019
  const int block_num_y = (nrows + ny - 1) / ny;
1020
  const sycl::range<3> block_nums(1, 1, block_num_y);
1021
  const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1022
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
1023
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1024
+ dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
1025
+ });
 
1026
  }
1027
 
1028
  static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
 
1031
  dpct::queue_ptr stream) {
1032
  GGML_ASSERT(ncols % QK_K == 0);
1033
  const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
1034
+ sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
1035
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1036
+ dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
1037
+ });
 
1038
  }
1039
 
1040
  static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
 
1046
  const int block_num_y = (nrows + ny - 1) / ny;
1047
  const sycl::range<3> block_nums(1, 1, block_num_y);
1048
  const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1049
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
1050
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1051
+ dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
1052
+ });
 
1053
  }
1054
 
1055
  void ggml_sycl_op_dequantize_mul_mat_vec(
ggml/src/ggml-sycl/dpct/helper.hpp CHANGED
@@ -13,10 +13,10 @@
13
  #ifndef GGML_SYCL_DPCT_HELPER_HPP
14
  #define GGML_SYCL_DPCT_HELPER_HPP
15
 
 
16
  #include <sycl/sycl.hpp>
17
  #include <sycl/half_type.hpp>
18
  #include <syclcompat/math.hpp>
19
- #include <map>
20
 
21
  #ifdef GGML_SYCL_USE_INTEL_ONEMKL
22
  #include <oneapi/mkl.hpp>
@@ -118,6 +118,36 @@ inline auto get_onemath_backend(sycl::queue& queue)
118
  #endif
119
  }
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  namespace dpct
122
  {
123
  typedef sycl::queue *queue_ptr;
 
13
  #ifndef GGML_SYCL_DPCT_HELPER_HPP
14
  #define GGML_SYCL_DPCT_HELPER_HPP
15
 
16
+ #include <map>
17
  #include <sycl/sycl.hpp>
18
  #include <sycl/half_type.hpp>
19
  #include <syclcompat/math.hpp>
 
20
 
21
  #ifdef GGML_SYCL_USE_INTEL_ONEMKL
22
  #include <oneapi/mkl.hpp>
 
118
  #endif
119
  }
120
 
121
+ #ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS
122
+ namespace syclex = sycl::ext::oneapi::experimental;
123
+ #endif
124
+
125
+ template <int NR, typename Func>
126
+ __dpct_inline__ void sycl_parallel_for(sycl::handler & cgh, sycl::nd_range<NR> nd_range, Func && func) {
127
+ #ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS
128
+ syclex::nd_launch(cgh, nd_range, func);
129
+ #else
130
+ cgh.parallel_for(nd_range, func);
131
+ #endif
132
+ }
133
+
134
+ template <int NR, typename Func>
135
+ __dpct_inline__ void sycl_parallel_for(sycl::queue * q, sycl::nd_range<NR> nd_range, Func && func) {
136
+ #ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS
137
+ syclex::nd_launch(*q, nd_range, func);
138
+ #else
139
+ q->parallel_for(nd_range, func);
140
+ #endif
141
+ }
142
+
143
+ template <typename Func> __dpct_inline__ void sycl_launch(sycl::queue * stream, Func && func) {
144
+ #ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS
145
+ syclex::submit(*stream, func);
146
+ #else
147
+ stream->submit(func);
148
+ #endif
149
+ }
150
+
151
  namespace dpct
152
  {
153
  typedef sycl::queue *queue_ptr;
ggml/src/ggml-sycl/element_wise.cpp CHANGED
@@ -329,60 +329,51 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
329
  const int ne12, const int nb1, const int nb2,
330
  const int offset, queue_ptr stream) {
331
  int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE;
332
- stream->parallel_for(
333
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
334
- sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE),
335
- sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)),
336
- [=](sycl::nd_item<3> item_ct1) {
337
- acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
338
- item_ct1);
339
- });
340
  }
341
 
342
  template<typename T>
343
  static void gelu_sycl(const T *x, T *dst, const int k,
344
  queue_ptr stream) {
345
  const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
346
- stream->parallel_for(
347
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
348
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
349
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
350
- [=](sycl::nd_item<3> item_ct1) {
351
- gelu(x, dst, k, item_ct1);
352
- });
353
  }
354
 
355
  template<typename T>
356
  static void silu_sycl(const T *x, T *dst, const int k,
357
  queue_ptr stream) {
358
  const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
359
- stream->parallel_for(
360
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
361
- sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
362
- sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
363
- [=](sycl::nd_item<3> item_ct1) {
364
- silu(x, dst, k, item_ct1);
365
- });
366
  }
367
 
368
  template<typename T>
369
  static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
370
  // hard code for now
371
  const int num_blocks = ceil_div(k, 256);
372
- stream->parallel_for(
373
- sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
374
- sgn(x, dst, k, item_ct1);
375
- });
376
  }
377
 
378
  template<typename T>
379
  static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
380
  // hard code for now
381
  const int num_blocks = ceil_div(k, 256);
382
- stream->parallel_for(
383
- sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
384
- abs_op(x, dst, k, item_ct1);
385
- });
386
  }
387
 
388
 
@@ -390,23 +381,20 @@ template<typename T>
390
  static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
391
  // hard code for now
392
  const int num_blocks = ceil_div(k, 256);
393
- stream->parallel_for(
394
- sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
395
- elu_op(x, dst, k, item_ct1);
396
- });
397
  }
398
 
399
  template<typename T>
400
  static void gelu_quick_sycl(const T *x, T *dst, const int k,
401
  queue_ptr stream) {
402
  const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
403
- stream->parallel_for(
404
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
405
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
406
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
407
- [=](sycl::nd_item<3> item_ct1) {
408
- gelu_quick(x, dst, k, item_ct1);
409
- });
410
  }
411
 
412
 
@@ -414,169 +402,133 @@ template<typename T>
414
  static void gelu_erf_sycl(const T *x, T *dst, const int k,
415
  queue_ptr stream) {
416
  const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
417
- stream->parallel_for(
418
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
419
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
420
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
421
- [=](sycl::nd_item<3> item_ct1) {
422
- gelu_erf(x, dst, k, item_ct1);
423
- });
424
  }
425
 
426
  template<typename T>
427
  static void tanh_sycl(const T *x, T *dst, const int k,
428
  queue_ptr stream) {
429
  const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
430
- stream->parallel_for(
431
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
432
- sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
433
- sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
434
- [=](sycl::nd_item<3> item_ct1) {
435
- tanh(x, dst, k, item_ct1);
436
- });
437
  }
438
 
439
  template<typename T>
440
  static void relu_sycl(const T *x, T *dst, const int k,
441
  queue_ptr stream) {
442
  const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
443
- stream->parallel_for(
444
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
445
- sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
446
- sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
447
- [=](sycl::nd_item<3> item_ct1) {
448
- relu(x, dst, k, item_ct1);
449
- });
450
  }
451
 
452
  template<typename T>
453
  static void hardsigmoid_sycl(const T *x, T *dst, const int k,
454
  queue_ptr stream) {
455
  const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
456
- stream->parallel_for(
457
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
458
- sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE),
459
  sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)),
460
- [=](sycl::nd_item<3> item_ct1) {
461
- hardsigmoid(x, dst, k, item_ct1);
462
- });
463
  }
464
 
465
  template<typename T>
466
  static void hardswish_sycl(const T *x, T *dst, const int k,
467
  queue_ptr stream) {
468
  const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
469
- stream->parallel_for(
470
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
471
- sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE),
472
  sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)),
473
- [=](sycl::nd_item<3> item_ct1) {
474
- hardswish(x, dst, k, item_ct1);
475
- });
476
  }
477
 
478
  template<typename T>
479
  static void exp_sycl(const T *x, T *dst, const int k,
480
  queue_ptr stream) {
481
  const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
482
- stream->parallel_for(
483
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
484
- sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
485
- sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
486
- [=](sycl::nd_item<3> item_ct1) {
487
- exp(x, dst, k, item_ct1);
488
- });
489
  }
490
 
491
  template<typename T>
492
  static void log_sycl(const T *x, T *dst, const int k,
493
  queue_ptr stream) {
494
  const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
495
- stream->parallel_for(
496
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
497
- sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
498
- sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
499
- [=](sycl::nd_item<3> item_ct1) {
500
- log(x, dst, k, item_ct1);
501
- });
502
  }
503
 
504
  template<typename T>
505
  static void neg_sycl(const T *x, T *dst, const int k,
506
  queue_ptr stream) {
507
  const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
508
- stream->parallel_for(
509
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
510
- sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
511
- sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
512
- [=](sycl::nd_item<3> item_ct1) {
513
- neg(x, dst, k, item_ct1);
514
- });
515
  }
516
 
517
  template<typename T>
518
  static void step_sycl(const T *x, T *dst, const int k,
519
  queue_ptr stream) {
520
  const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
521
- stream->parallel_for(
522
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
523
- sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
524
- sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
525
- [=](sycl::nd_item<3> item_ct1) {
526
- step(x, dst, k, item_ct1);
527
- });
528
  }
529
 
530
  template<typename T>
531
  static void sigmoid_sycl(const T *x, T *dst, const int k,
532
  queue_ptr stream) {
533
  const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
534
- stream->parallel_for(
535
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
536
- sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE),
537
  sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)),
538
- [=](sycl::nd_item<3> item_ct1) {
539
- sigmoid(x, dst, k, item_ct1);
540
- });
541
  }
542
 
543
  template<typename T>
544
  static void sqrt_sycl(const T *x, T *dst, const int k,
545
  queue_ptr stream) {
546
  const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
547
- stream->parallel_for(
548
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
549
- sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE),
550
- sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)),
551
- [=](sycl::nd_item<3> item_ct1) {
552
- sqrt(x, dst, k, item_ct1);
553
- });
554
  }
555
 
556
  template<typename T>
557
  static void sin_sycl(const T *x, T *dst, const int k,
558
  queue_ptr stream) {
559
  const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
560
- stream->parallel_for(
561
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
562
- sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
563
- sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
564
- [=](sycl::nd_item<3> item_ct1) {
565
- sin(x, dst, k, item_ct1);
566
- });
567
  }
568
 
569
  template<typename T>
570
  static void cos_sycl(const T *x, T *dst, const int k,
571
  queue_ptr stream) {
572
  const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
573
- stream->parallel_for(
574
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
575
- sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
576
- sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
577
- [=](sycl::nd_item<3> item_ct1) {
578
- cos(x, dst, k, item_ct1);
579
- });
580
  }
581
 
582
  template<typename T>
@@ -584,26 +536,20 @@ static void leaky_relu_sycl(const T *x, T *dst, const int k,
584
  const float negative_slope,
585
  queue_ptr stream) {
586
  const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
587
- stream->parallel_for(
588
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
589
- sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
590
- sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
591
- [=](sycl::nd_item<3> item_ct1) {
592
- leaky_relu(x, dst, k, negative_slope, item_ct1);
593
- });
594
  }
595
 
596
  template<typename T>
597
  static void sqr_sycl(const T *x, T *dst, const int k,
598
  queue_ptr stream) {
599
  const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
600
- stream->parallel_for(
601
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
602
- sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
603
- sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
604
- [=](sycl::nd_item<3> item_ct1) {
605
- sqr(x, dst, k, item_ct1);
606
- });
607
  }
608
 
609
  template<typename T>
@@ -614,9 +560,8 @@ static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
614
  int dst_size = ne10 * ne11 * ne12 * ne13;
615
  int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
616
  sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
617
- stream->parallel_for(
618
- sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
619
- [=](sycl::nd_item<1> item_ct1) {
620
  upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
621
  });
622
  }
@@ -627,12 +572,10 @@ static void pad_sycl(const T *x, T *dst, const int ne00,
627
  const int ne1, const int ne2, queue_ptr stream) {
628
  int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
629
  sycl::range<3> gridDim(ne2, ne1, num_blocks);
630
- stream->parallel_for(
631
- sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
632
- sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
633
- [=](sycl::nd_item<3> item_ct1) {
634
- pad(x, dst, ne0, ne00, ne01, ne02, item_ct1);
635
- });
636
  }
637
 
638
  template<typename T>
@@ -640,13 +583,10 @@ static void clamp_sycl(const T *x, T *dst, const float min,
640
  const float max, const int k,
641
  queue_ptr stream) {
642
  const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE;
643
- stream->parallel_for(
644
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
645
- sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
646
- sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
647
- [=](sycl::nd_item<3> item_ct1) {
648
- clamp(x, dst, min, max, k, item_ct1);
649
- });
650
  }
651
 
652
  inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
329
  const int ne12, const int nb1, const int nb2,
330
  const int offset, queue_ptr stream) {
331
  int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE;
332
+ sycl_parallel_for(stream,
333
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE),
334
+ sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)),
335
+ [=](sycl::nd_item<3> item_ct1) {
336
+ acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, item_ct1);
337
+ });
 
 
338
  }
339
 
340
  template<typename T>
341
  static void gelu_sycl(const T *x, T *dst, const int k,
342
  queue_ptr stream) {
343
  const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
344
+ sycl_parallel_for(stream,
345
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
346
+ sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
347
+ [=](sycl::nd_item<3> item_ct1) { gelu(x, dst, k, item_ct1); });
 
 
 
348
  }
349
 
350
  template<typename T>
351
  static void silu_sycl(const T *x, T *dst, const int k,
352
  queue_ptr stream) {
353
  const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
354
+ sycl_parallel_for(stream,
355
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
356
+ sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
357
+ [=](sycl::nd_item<3> item_ct1) { silu(x, dst, k, item_ct1); });
 
 
 
358
  }
359
 
360
  template<typename T>
361
  static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
362
  // hard code for now
363
  const int num_blocks = ceil_div(k, 256);
364
+ sycl_parallel_for(
365
+ stream, sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)),
366
+ [=](sycl::nd_item<3> item_ct1) { sgn(x, dst, k, item_ct1); });
 
367
  }
368
 
369
  template<typename T>
370
  static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
371
  // hard code for now
372
  const int num_blocks = ceil_div(k, 256);
373
+ sycl_parallel_for(
374
+ stream,
375
+ sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)),
376
+ [=](sycl::nd_item<3> item_ct1) { abs_op(x, dst, k, item_ct1); });
377
  }
378
 
379
 
 
381
  static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
382
  // hard code for now
383
  const int num_blocks = ceil_div(k, 256);
384
+ sycl_parallel_for(
385
+ stream,
386
+ sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)),
387
+ [=](sycl::nd_item<3> item_ct1) { elu_op(x, dst, k, item_ct1); });
388
  }
389
 
390
  template<typename T>
391
  static void gelu_quick_sycl(const T *x, T *dst, const int k,
392
  queue_ptr stream) {
393
  const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
394
+ sycl_parallel_for(stream,
395
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
396
+ sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
397
+ [=](sycl::nd_item<3> item_ct1) { gelu_quick(x, dst, k, item_ct1); });
 
 
 
398
  }
399
 
400
 
 
402
  static void gelu_erf_sycl(const T *x, T *dst, const int k,
403
  queue_ptr stream) {
404
  const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
405
+ sycl_parallel_for(stream,
406
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
407
+ sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
408
+ [=](sycl::nd_item<3> item_ct1) { gelu_erf(x, dst, k, item_ct1); });
 
 
 
409
  }
410
 
411
  template<typename T>
412
  static void tanh_sycl(const T *x, T *dst, const int k,
413
  queue_ptr stream) {
414
  const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
415
+ sycl_parallel_for(stream,
416
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
417
+ sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
418
+ [=](sycl::nd_item<3> item_ct1) { tanh(x, dst, k, item_ct1); });
 
 
 
419
  }
420
 
421
  template<typename T>
422
  static void relu_sycl(const T *x, T *dst, const int k,
423
  queue_ptr stream) {
424
  const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
425
+ sycl_parallel_for(stream,
426
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
427
+ sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
428
+ [=](sycl::nd_item<3> item_ct1) { relu(x, dst, k, item_ct1); });
 
 
 
429
  }
430
 
431
  template<typename T>
432
  static void hardsigmoid_sycl(const T *x, T *dst, const int k,
433
  queue_ptr stream) {
434
  const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
435
+ sycl_parallel_for(
436
+ stream,
437
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE),
438
  sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)),
439
+ [=](sycl::nd_item<3> item_ct1) { hardsigmoid(x, dst, k, item_ct1); });
 
 
440
  }
441
 
442
  template<typename T>
443
  static void hardswish_sycl(const T *x, T *dst, const int k,
444
  queue_ptr stream) {
445
  const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
446
+ sycl_parallel_for(
447
+ stream,
448
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE),
449
  sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)),
450
+ [=](sycl::nd_item<3> item_ct1) { hardswish(x, dst, k, item_ct1); });
 
 
451
  }
452
 
453
  template<typename T>
454
  static void exp_sycl(const T *x, T *dst, const int k,
455
  queue_ptr stream) {
456
  const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
457
+ sycl_parallel_for(stream,
458
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
459
+ sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
460
+ [=](sycl::nd_item<3> item_ct1) { exp(x, dst, k, item_ct1); });
 
 
 
461
  }
462
 
463
  template<typename T>
464
  static void log_sycl(const T *x, T *dst, const int k,
465
  queue_ptr stream) {
466
  const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
467
+ sycl_parallel_for(stream,
468
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
469
+ sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
470
+ [=](sycl::nd_item<3> item_ct1) { log(x, dst, k, item_ct1); });
 
 
 
471
  }
472
 
473
  template<typename T>
474
  static void neg_sycl(const T *x, T *dst, const int k,
475
  queue_ptr stream) {
476
  const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
477
+ sycl_parallel_for(stream,
478
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
479
+ sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
480
+ [=](sycl::nd_item<3> item_ct1) { neg(x, dst, k, item_ct1); });
 
 
 
481
  }
482
 
483
  template<typename T>
484
  static void step_sycl(const T *x, T *dst, const int k,
485
  queue_ptr stream) {
486
  const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
487
+ sycl_parallel_for(stream,
488
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
489
+ sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
490
+ [=](sycl::nd_item<3> item_ct1) { step(x, dst, k, item_ct1); });
 
 
 
491
  }
492
 
493
  template<typename T>
494
  static void sigmoid_sycl(const T *x, T *dst, const int k,
495
  queue_ptr stream) {
496
  const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
497
+ sycl_parallel_for(
498
+ stream,
499
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE),
500
  sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)),
501
+ [=](sycl::nd_item<3> item_ct1) { sigmoid(x, dst, k, item_ct1); });
 
 
502
  }
503
 
504
  template<typename T>
505
  static void sqrt_sycl(const T *x, T *dst, const int k,
506
  queue_ptr stream) {
507
  const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
508
+ sycl_parallel_for(stream,
509
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE),
510
+ sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)),
511
+ [=](sycl::nd_item<3> item_ct1) { sqrt(x, dst, k, item_ct1); });
 
 
 
512
  }
513
 
514
  template<typename T>
515
  static void sin_sycl(const T *x, T *dst, const int k,
516
  queue_ptr stream) {
517
  const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
518
+ sycl_parallel_for(stream,
519
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
520
+ sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
521
+ [=](sycl::nd_item<3> item_ct1) { sin(x, dst, k, item_ct1); });
 
 
 
522
  }
523
 
524
  template<typename T>
525
  static void cos_sycl(const T *x, T *dst, const int k,
526
  queue_ptr stream) {
527
  const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
528
+ sycl_parallel_for(stream,
529
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
530
+ sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
531
+ [=](sycl::nd_item<3> item_ct1) { cos(x, dst, k, item_ct1); });
 
 
 
532
  }
533
 
534
  template<typename T>
 
536
  const float negative_slope,
537
  queue_ptr stream) {
538
  const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
539
+ sycl_parallel_for(stream,
540
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
541
+ sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
542
+ [=](sycl::nd_item<3> item_ct1) { leaky_relu(x, dst, k, negative_slope, item_ct1); });
 
 
 
543
  }
544
 
545
  template<typename T>
546
  static void sqr_sycl(const T *x, T *dst, const int k,
547
  queue_ptr stream) {
548
  const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
549
+ sycl_parallel_for(stream,
550
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
551
+ sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
552
+ [=](sycl::nd_item<3> item_ct1) { sqr(x, dst, k, item_ct1); });
 
 
 
553
  }
554
 
555
  template<typename T>
 
560
  int dst_size = ne10 * ne11 * ne12 * ne13;
561
  int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
562
  sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
563
+ sycl_parallel_for<1>(
564
+ stream, sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
 
565
  upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
566
  });
567
  }
 
572
  const int ne1, const int ne2, queue_ptr stream) {
573
  int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
574
  sycl::range<3> gridDim(ne2, ne1, num_blocks);
575
+ sycl_parallel_for(stream,
576
+ sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
577
+ sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
578
+ [=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); });
 
 
579
  }
580
 
581
  template<typename T>
 
583
  const float max, const int k,
584
  queue_ptr stream) {
585
  const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE;
586
+ sycl_parallel_for(stream,
587
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
588
+ sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
589
+ [=](sycl::nd_item<3> item_ct1) { clamp(x, dst, min, max, k, item_ct1); });
 
 
 
590
  }
591
 
592
  inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml/src/ggml-sycl/getrows.cpp CHANGED
@@ -118,12 +118,10 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
118
 
119
  GGML_ASSERT(ne00 % 2 == 0);
120
 
121
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
122
- [=](sycl::nd_item<3> item_ct1) {
123
- k_get_rows<qk, qr, dq>(
124
- src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
125
- s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
126
- });
127
 
128
  GGML_UNUSED(dst);
129
  GGML_UNUSED(ctx);
@@ -156,9 +154,8 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
156
  dpct::has_capability_or_fail(stream->get_device(),
157
  {sycl::aspect::fp16});
158
 
159
- stream->parallel_for(
160
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
161
- [=](sycl::nd_item<3> item_ct1) {
162
  k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
163
  s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
164
  });
 
118
 
119
  GGML_ASSERT(ne00 % 2 == 0);
120
 
121
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
122
+ k_get_rows<qk, qr, dq>(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12,
123
+ item_ct1);
124
+ });
 
 
125
 
126
  GGML_UNUSED(dst);
127
  GGML_UNUSED(ctx);
 
154
  dpct::has_capability_or_fail(stream->get_device(),
155
  {sycl::aspect::fp16});
156
 
157
+ sycl_parallel_for(
158
+ stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
159
  k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
160
  s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
161
  });
ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -1887,13 +1887,12 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
1887
  const size_t shared_mem = ncols_pad * sizeof(int);
1888
 
1889
  if (order == GGML_SORT_ORDER_ASC) {
1890
- stream->submit([&](sycl::handler &cgh) {
1891
  sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
1892
  sycl::range<1>(shared_mem), cgh);
1893
 
1894
- cgh.parallel_for(
1895
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1896
- [=](sycl::nd_item<3> item_ct1) {
1897
  k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(
1898
  x, dst, ncols, ncols_pad, item_ct1,
1899
  dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
@@ -1901,13 +1900,12 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
1901
  });
1902
  });
1903
  } else if (order == GGML_SORT_ORDER_DESC) {
1904
- stream->submit([&](sycl::handler &cgh) {
1905
  sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
1906
  sycl::range<1>(shared_mem), cgh);
1907
 
1908
- cgh.parallel_for(
1909
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1910
- [=](sycl::nd_item<3> item_ct1) {
1911
  k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(
1912
  x, dst, ncols, ncols_pad, item_ct1,
1913
  dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
@@ -1925,50 +1923,47 @@ static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols,
1925
  const sycl::range<3> block_nums(1, nrows, 1);
1926
  const size_t shared_mem = 256 * sizeof(float);
1927
 
1928
- stream->submit([&](sycl::handler &cgh) {
1929
  sycl::local_accessor<float, 1> shared_data(
1930
  sycl::range<1>(shared_mem/sizeof(float)), cgh);
1931
  sycl::local_accessor<int, 1> shared_indices(
1932
  sycl::range<1>(shared_mem/sizeof(float)), cgh);
1933
 
1934
- cgh.parallel_for(
1935
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1936
- [=](sycl::nd_item<3> item_ct1) {
1937
- const int tid = item_ct1.get_local_id(2);
1938
- const int row = item_ct1.get_global_id(1);
1939
-
1940
- float max_val = -INFINITY;
1941
- int max_idx = -1;
1942
-
1943
- for (int col = tid; col < ncols; col += 256) {
1944
- float val = x[row * ncols + col];
1945
- if (val > max_val) {
1946
- max_val = val;
1947
- max_idx = col;
1948
- }
1949
- }
1950
 
1951
- shared_data[tid] = max_val;
1952
- shared_indices[tid] = max_idx;
1953
- item_ct1.barrier(sycl::access::fence_space::local_space);
1954
 
1955
- for (int stride = 256/2; stride > 0; stride >>= 1) {
1956
- if (tid < stride) {
1957
- float val1 = shared_data[tid];
1958
- float val2 = shared_data[tid + stride];
1959
- if (val2 > val1) {
1960
- shared_data[tid] = val2;
1961
- shared_indices[tid] = shared_indices[tid + stride];
1962
- }
1963
- }
1964
- item_ct1.barrier(sycl::access::fence_space::local_space);
1965
  }
 
1966
 
 
 
 
1967
 
1968
- if (tid == 0) {
1969
- dst[row] = shared_indices[0];
 
 
 
 
 
 
1970
  }
1971
- });
 
 
 
 
 
 
1972
  });
1973
  }
1974
  static void diag_mask_inf_f32_sycl(const float *x, float *dst,
@@ -2952,7 +2947,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
2952
  void ** ptrs_dst_get = ptrs_dst.get();
2953
  size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
2954
  size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
2955
- cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2956
  k_compute_batched_ptrs(src0_f16, src1_f16, dst_ddf, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
2957
  nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
2958
  });
@@ -3456,7 +3451,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3456
  {
3457
  sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, 768u));
3458
  sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
3459
- stream->submit([&](sycl::handler &cgh) {
3460
  sycl::local_accessor<int, 0> src1_row_acc(cgh);
3461
 
3462
  char *__restrict src1_contiguous_get =
@@ -3468,9 +3463,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3468
  size_t ids_nb_ct6 = ids->nb[1];
3469
  size_t ids_nb_ct7 = ids->nb[0];
3470
 
3471
- cgh.parallel_for(
3472
- sycl::nd_range<3>(grid_dims * block_dims, block_dims),
3473
- [=](sycl::nd_item<3> item_ct1) {
3474
  k_copy_src1_to_contiguous(
3475
  src1_original, src1_contiguous_get,
3476
  dev_cur_src1_row_get,
@@ -3501,15 +3495,14 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3501
  {
3502
  sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, 768u));
3503
  sycl::range<3> grid_dims(1, 1, num_src1_rows);
3504
- stream->submit([&](sycl::handler &cgh) {
3505
  const char *__restrict dst_contiguous_get =
3506
  dst_contiguous.get();
3507
  const mmid_row_mapping *__restrict dev_row_mapping_get =
3508
  dev_row_mapping.get();
3509
 
3510
- cgh.parallel_for(
3511
- sycl::nd_range<3>(grid_dims * block_dims, block_dims),
3512
- [=](sycl::nd_item<3> item_ct1) {
3513
  k_copy_dst_from_contiguous(dst_original,
3514
  dst_contiguous_get,
3515
  dev_row_mapping_get,
 
1887
  const size_t shared_mem = ncols_pad * sizeof(int);
1888
 
1889
  if (order == GGML_SORT_ORDER_ASC) {
1890
+ sycl_launch(stream, [&](sycl::handler & cgh) {
1891
  sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
1892
  sycl::range<1>(shared_mem), cgh);
1893
 
1894
+ sycl_parallel_for(
1895
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
1896
  k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(
1897
  x, dst, ncols, ncols_pad, item_ct1,
1898
  dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
 
1900
  });
1901
  });
1902
  } else if (order == GGML_SORT_ORDER_DESC) {
1903
+ sycl_launch(stream, [&](sycl::handler & cgh) {
1904
  sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
1905
  sycl::range<1>(shared_mem), cgh);
1906
 
1907
+ sycl_parallel_for(
1908
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
1909
  k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(
1910
  x, dst, ncols, ncols_pad, item_ct1,
1911
  dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
 
1923
  const sycl::range<3> block_nums(1, nrows, 1);
1924
  const size_t shared_mem = 256 * sizeof(float);
1925
 
1926
+ sycl_launch(stream, [&](sycl::handler & cgh) {
1927
  sycl::local_accessor<float, 1> shared_data(
1928
  sycl::range<1>(shared_mem/sizeof(float)), cgh);
1929
  sycl::local_accessor<int, 1> shared_indices(
1930
  sycl::range<1>(shared_mem/sizeof(float)), cgh);
1931
 
1932
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
1933
+ const int tid = item_ct1.get_local_id(2);
1934
+ const int row = item_ct1.get_global_id(1);
 
 
 
 
 
 
 
 
 
 
 
 
 
1935
 
1936
+ float max_val = -INFINITY;
1937
+ int max_idx = -1;
 
1938
 
1939
+ for (int col = tid; col < ncols; col += 256) {
1940
+ float val = x[row * ncols + col];
1941
+ if (val > max_val) {
1942
+ max_val = val;
1943
+ max_idx = col;
 
 
 
 
 
1944
  }
1945
+ }
1946
 
1947
+ shared_data[tid] = max_val;
1948
+ shared_indices[tid] = max_idx;
1949
+ item_ct1.barrier(sycl::access::fence_space::local_space);
1950
 
1951
+ for (int stride = 256 / 2; stride > 0; stride >>= 1) {
1952
+ if (tid < stride) {
1953
+ float val1 = shared_data[tid];
1954
+ float val2 = shared_data[tid + stride];
1955
+ if (val2 > val1) {
1956
+ shared_data[tid] = val2;
1957
+ shared_indices[tid] = shared_indices[tid + stride];
1958
+ }
1959
  }
1960
+ item_ct1.barrier(sycl::access::fence_space::local_space);
1961
+ }
1962
+
1963
+ if (tid == 0) {
1964
+ dst[row] = shared_indices[0];
1965
+ }
1966
+ });
1967
  });
1968
  }
1969
  static void diag_mask_inf_f32_sycl(const float *x, float *dst,
 
2947
  void ** ptrs_dst_get = ptrs_dst.get();
2948
  size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
2949
  size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
2950
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2951
  k_compute_batched_ptrs(src0_f16, src1_f16, dst_ddf, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
2952
  nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
2953
  });
 
3451
  {
3452
  sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, 768u));
3453
  sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
3454
+ sycl_launch(stream, [&](sycl::handler & cgh) {
3455
  sycl::local_accessor<int, 0> src1_row_acc(cgh);
3456
 
3457
  char *__restrict src1_contiguous_get =
 
3463
  size_t ids_nb_ct6 = ids->nb[1];
3464
  size_t ids_nb_ct7 = ids->nb[0];
3465
 
3466
+ sycl_parallel_for(
3467
+ cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
3468
  k_copy_src1_to_contiguous(
3469
  src1_original, src1_contiguous_get,
3470
  dev_cur_src1_row_get,
 
3495
  {
3496
  sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, 768u));
3497
  sycl::range<3> grid_dims(1, 1, num_src1_rows);
3498
+ sycl_launch(stream, [&](sycl::handler & cgh) {
3499
  const char *__restrict dst_contiguous_get =
3500
  dst_contiguous.get();
3501
  const mmid_row_mapping *__restrict dev_row_mapping_get =
3502
  dev_row_mapping.get();
3503
 
3504
+ sycl_parallel_for(
3505
+ cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
3506
  k_copy_dst_from_contiguous(dst_original,
3507
  dst_contiguous_get,
3508
  dev_row_mapping_get,
ggml/src/ggml-sycl/gla.cpp CHANGED
@@ -11,13 +11,13 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B,
11
  const u_int n_seq_tokens = T / B;
12
  sycl::range<1> block_dims((C / H));
13
  sycl::range<1> grid_dims((B * H));
14
- stream->submit([&](sycl::handler & cgh) {
15
  /* local memory accessors*/
16
  auto _k = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
17
  auto _r = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
18
  auto _td = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
19
 
20
- cgh.parallel_for(sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) {
21
  u_int tid = item.get_local_id(0);
22
  u_int bid = item.get_group(0);
23
 
 
11
  const u_int n_seq_tokens = T / B;
12
  sycl::range<1> block_dims((C / H));
13
  sycl::range<1> grid_dims((B * H));
14
+ sycl_launch(stream, [&](sycl::handler & cgh) {
15
  /* local memory accessors*/
16
  auto _k = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
17
  auto _r = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
18
  auto _td = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
19
 
20
+ sycl_parallel_for<1>(cgh, sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) {
21
  u_int tid = item.get_local_id(0);
22
  u_int bid = item.get_group(0);
23
 
ggml/src/ggml-sycl/im2col.cpp CHANGED
@@ -70,7 +70,7 @@ static void im2col_sycl_internal(const float * x, T * dst, int64_t IW, int64_t I
70
 
71
  const int64_t CHW = IC * KH * KW;
72
 
73
- stream->parallel_for(sycl::nd_range<3>(block_nums * local_range, local_range), [=](sycl::nd_item<3> item_ct1) {
74
  im2col_kernel<T>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, CHW, s0, s1,
75
  p0, p1, d0, d1, item_ct1);
76
  });
 
70
 
71
  const int64_t CHW = IC * KH * KW;
72
 
73
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * local_range, local_range), [=](sycl::nd_item<3> item_ct1) {
74
  im2col_kernel<T>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, CHW, s0, s1,
75
  p0, p1, d0, d1, item_ct1);
76
  });
ggml/src/ggml-sycl/mmq.cpp CHANGED
@@ -1818,7 +1818,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
1818
  dpct::has_capability_or_fail(stream->get_device(),
1819
  {sycl::aspect::fp16});
1820
 
1821
- stream->submit([&](sycl::handler &cgh) {
1822
  sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
1823
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
1824
  sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
@@ -1829,9 +1829,8 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
1829
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
1830
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
1831
 
1832
- cgh.parallel_for(
1833
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1834
- [=](sycl::nd_item<3> item_ct1) {
1835
  mul_mat_q4_0<need_check>(
1836
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
1837
  nrows_dst, item_ct1,
@@ -1853,7 +1852,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
1853
  dpct::has_capability_or_fail(stream->get_device(),
1854
  {sycl::aspect::fp16});
1855
 
1856
- stream->submit([&](sycl::handler &cgh) {
1857
  sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
1858
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
1859
  sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
@@ -1864,9 +1863,8 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
1864
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
1865
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
1866
 
1867
- cgh.parallel_for(
1868
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1869
- [=](sycl::nd_item<3> item_ct1) {
1870
  mul_mat_q4_0<need_check>(
1871
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
1872
  nrows_dst, item_ct1,
@@ -1933,7 +1931,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
1933
  dpct::has_capability_or_fail(stream->get_device(),
1934
  {sycl::aspect::fp16});
1935
 
1936
- stream->submit([&](sycl::handler &cgh) {
1937
  sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
1938
  sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
1939
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
@@ -1944,9 +1942,8 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
1944
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
1945
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
1946
 
1947
- cgh.parallel_for(
1948
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1949
- [=](sycl::nd_item<3> item_ct1) {
1950
  mul_mat_q4_1<need_check>(
1951
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
1952
  nrows_dst, item_ct1,
@@ -1968,7 +1965,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
1968
  dpct::has_capability_or_fail(stream->get_device(),
1969
  {sycl::aspect::fp16});
1970
 
1971
- stream->submit([&](sycl::handler &cgh) {
1972
  sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
1973
  sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
1974
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
@@ -1979,9 +1976,8 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
1979
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
1980
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
1981
 
1982
- cgh.parallel_for(
1983
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1984
- [=](sycl::nd_item<3> item_ct1) {
1985
  mul_mat_q4_1<need_check>(
1986
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
1987
  nrows_dst, item_ct1,
@@ -2048,7 +2044,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
2048
  dpct::has_capability_or_fail(stream->get_device(),
2049
  {sycl::aspect::fp16});
2050
 
2051
- stream->submit([&](sycl::handler &cgh) {
2052
  sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
2053
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2054
  sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
@@ -2059,9 +2055,8 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
2059
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2060
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2061
 
2062
- cgh.parallel_for(
2063
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2064
- [=](sycl::nd_item<3> item_ct1) {
2065
  mul_mat_q5_0<need_check>(
2066
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2067
  nrows_dst, item_ct1,
@@ -2083,7 +2078,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
2083
  dpct::has_capability_or_fail(stream->get_device(),
2084
  {sycl::aspect::fp16});
2085
 
2086
- stream->submit([&](sycl::handler &cgh) {
2087
  sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
2088
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2089
  sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
@@ -2094,9 +2089,8 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
2094
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2095
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2096
 
2097
- cgh.parallel_for(
2098
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2099
- [=](sycl::nd_item<3> item_ct1) {
2100
  mul_mat_q5_0<need_check>(
2101
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2102
  nrows_dst, item_ct1,
@@ -2163,7 +2157,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
2163
  dpct::has_capability_or_fail(stream->get_device(),
2164
  {sycl::aspect::fp16});
2165
 
2166
- stream->submit([&](sycl::handler &cgh) {
2167
  sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
2168
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2169
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
@@ -2174,9 +2168,8 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
2174
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2175
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2176
 
2177
- cgh.parallel_for(
2178
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2179
- [=](sycl::nd_item<3> item_ct1) {
2180
  mul_mat_q5_1<need_check>(
2181
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2182
  nrows_dst, item_ct1,
@@ -2198,7 +2191,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
2198
  dpct::has_capability_or_fail(stream->get_device(),
2199
  {sycl::aspect::fp16});
2200
 
2201
- stream->submit([&](sycl::handler &cgh) {
2202
  sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
2203
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2204
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
@@ -2209,9 +2202,8 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
2209
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2210
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2211
 
2212
- cgh.parallel_for(
2213
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2214
- [=](sycl::nd_item<3> item_ct1) {
2215
  mul_mat_q5_1<need_check>(
2216
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2217
  nrows_dst, item_ct1,
@@ -2278,7 +2270,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
2278
  dpct::has_capability_or_fail(stream->get_device(),
2279
  {sycl::aspect::fp16});
2280
 
2281
- stream->submit([&](sycl::handler &cgh) {
2282
  sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
2283
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2284
  sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
@@ -2289,9 +2281,8 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
2289
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2290
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2291
 
2292
- cgh.parallel_for(
2293
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2294
- [=](sycl::nd_item<3> item_ct1) {
2295
  mul_mat_q8_0<need_check>(
2296
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2297
  nrows_dst, item_ct1,
@@ -2313,7 +2304,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
2313
  dpct::has_capability_or_fail(stream->get_device(),
2314
  {sycl::aspect::fp16});
2315
 
2316
- stream->submit([&](sycl::handler &cgh) {
2317
  sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
2318
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2319
  sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
@@ -2324,9 +2315,8 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
2324
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2325
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2326
 
2327
- cgh.parallel_for(
2328
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2329
- [=](sycl::nd_item<3> item_ct1) {
2330
  mul_mat_q8_0<need_check>(
2331
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2332
  nrows_dst, item_ct1,
@@ -2393,7 +2383,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
2393
  dpct::has_capability_or_fail(stream->get_device(),
2394
  {sycl::aspect::fp16});
2395
 
2396
- stream->submit([&](sycl::handler &cgh) {
2397
  sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
2398
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2399
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
@@ -2406,9 +2396,8 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
2406
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2407
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2408
 
2409
- cgh.parallel_for(
2410
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2411
- [=](sycl::nd_item<3> item_ct1) {
2412
  mul_mat_q2_K<need_check>(
2413
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2414
  nrows_dst, item_ct1,
@@ -2431,7 +2420,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
2431
  dpct::has_capability_or_fail(stream->get_device(),
2432
  {sycl::aspect::fp16});
2433
 
2434
- stream->submit([&](sycl::handler &cgh) {
2435
  sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
2436
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2437
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
@@ -2444,9 +2433,8 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
2444
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2445
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2446
 
2447
- cgh.parallel_for(
2448
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2449
- [=](sycl::nd_item<3> item_ct1) {
2450
  mul_mat_q2_K<need_check>(
2451
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2452
  nrows_dst, item_ct1,
@@ -2516,7 +2504,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
2516
  dpct::has_capability_or_fail(stream->get_device(),
2517
  {sycl::aspect::fp16});
2518
 
2519
- stream->submit([&](sycl::handler &cgh) {
2520
  sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
2521
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2522
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
@@ -2531,9 +2519,8 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
2531
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2532
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2533
 
2534
- cgh.parallel_for(
2535
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2536
- [=](sycl::nd_item<3> item_ct1) {
2537
  mul_mat_q3_K<need_check>(
2538
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2539
  nrows_dst, item_ct1,
@@ -2557,7 +2544,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
2557
  dpct::has_capability_or_fail(stream->get_device(),
2558
  {sycl::aspect::fp16});
2559
 
2560
- stream->submit([&](sycl::handler &cgh) {
2561
  sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
2562
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2563
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
@@ -2572,9 +2559,8 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
2572
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2573
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2574
 
2575
- cgh.parallel_for(
2576
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2577
- [=](sycl::nd_item<3> item_ct1) {
2578
  mul_mat_q3_K<need_check>(
2579
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2580
  nrows_dst, item_ct1,
@@ -2644,7 +2630,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
2644
  dpct::has_capability_or_fail(stream->get_device(),
2645
  {sycl::aspect::fp16});
2646
 
2647
- stream->submit([&](sycl::handler &cgh) {
2648
  sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
2649
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2650
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
@@ -2657,9 +2643,8 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
2657
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2658
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2659
 
2660
- cgh.parallel_for(
2661
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2662
- [=](sycl::nd_item<3> item_ct1) {
2663
  mul_mat_q4_K<need_check>(
2664
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2665
  nrows_dst, item_ct1,
@@ -2682,7 +2667,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
2682
  dpct::has_capability_or_fail(stream->get_device(),
2683
  {sycl::aspect::fp16});
2684
 
2685
- stream->submit([&](sycl::handler &cgh) {
2686
  sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
2687
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2688
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
@@ -2695,9 +2680,8 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
2695
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2696
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2697
 
2698
- cgh.parallel_for(
2699
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2700
- [=](sycl::nd_item<3> item_ct1) {
2701
  mul_mat_q4_K<need_check>(
2702
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2703
  nrows_dst, item_ct1,
@@ -2765,7 +2749,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
2765
  dpct::has_capability_or_fail(stream->get_device(),
2766
  {sycl::aspect::fp16});
2767
 
2768
- stream->submit([&](sycl::handler &cgh) {
2769
  sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
2770
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2771
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
@@ -2778,9 +2762,8 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
2778
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2779
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2780
 
2781
- cgh.parallel_for(
2782
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2783
- [=](sycl::nd_item<3> item_ct1) {
2784
  mul_mat_q5_K<need_check>(
2785
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2786
  nrows_dst, item_ct1,
@@ -2803,7 +2786,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
2803
  dpct::has_capability_or_fail(stream->get_device(),
2804
  {sycl::aspect::fp16});
2805
 
2806
- stream->submit([&](sycl::handler &cgh) {
2807
  sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
2808
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2809
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
@@ -2816,9 +2799,8 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
2816
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2817
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2818
 
2819
- cgh.parallel_for(
2820
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2821
- [=](sycl::nd_item<3> item_ct1) {
2822
  mul_mat_q5_K<need_check>(
2823
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2824
  nrows_dst, item_ct1,
@@ -2886,7 +2868,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
2886
  dpct::has_capability_or_fail(stream->get_device(),
2887
  {sycl::aspect::fp16});
2888
 
2889
- stream->submit([&](sycl::handler &cgh) {
2890
  sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
2891
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2892
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
@@ -2899,9 +2881,8 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
2899
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2900
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2901
 
2902
- cgh.parallel_for(
2903
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2904
- [=](sycl::nd_item<3> item_ct1) {
2905
  mul_mat_q6_K<need_check>(
2906
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2907
  nrows_dst, item_ct1,
@@ -2924,7 +2905,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
2924
  dpct::has_capability_or_fail(stream->get_device(),
2925
  {sycl::aspect::fp16});
2926
 
2927
- stream->submit([&](sycl::handler &cgh) {
2928
  sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
2929
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2930
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
@@ -2937,9 +2918,8 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
2937
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2938
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2939
 
2940
- cgh.parallel_for(
2941
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2942
- [=](sycl::nd_item<3> item_ct1) {
2943
  mul_mat_q6_K<need_check>(
2944
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2945
  nrows_dst, item_ct1,
 
1818
  dpct::has_capability_or_fail(stream->get_device(),
1819
  {sycl::aspect::fp16});
1820
 
1821
+ sycl_launch(stream, [&](sycl::handler & cgh) {
1822
  sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
1823
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
1824
  sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
 
1829
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
1830
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
1831
 
1832
+ sycl_parallel_for(
1833
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
1834
  mul_mat_q4_0<need_check>(
1835
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
1836
  nrows_dst, item_ct1,
 
1852
  dpct::has_capability_or_fail(stream->get_device(),
1853
  {sycl::aspect::fp16});
1854
 
1855
+ sycl_launch(stream, [&](sycl::handler & cgh) {
1856
  sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
1857
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
1858
  sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
 
1863
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
1864
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
1865
 
1866
+ sycl_parallel_for(
1867
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
1868
  mul_mat_q4_0<need_check>(
1869
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
1870
  nrows_dst, item_ct1,
 
1931
  dpct::has_capability_or_fail(stream->get_device(),
1932
  {sycl::aspect::fp16});
1933
 
1934
+ sycl_launch(stream, [&](sycl::handler & cgh) {
1935
  sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
1936
  sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
1937
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
 
1942
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
1943
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
1944
 
1945
+ sycl_parallel_for(
1946
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
1947
  mul_mat_q4_1<need_check>(
1948
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
1949
  nrows_dst, item_ct1,
 
1965
  dpct::has_capability_or_fail(stream->get_device(),
1966
  {sycl::aspect::fp16});
1967
 
1968
+ sycl_launch(stream, [&](sycl::handler & cgh) {
1969
  sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
1970
  sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
1971
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
 
1976
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
1977
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
1978
 
1979
+ sycl_parallel_for(
1980
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
1981
  mul_mat_q4_1<need_check>(
1982
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
1983
  nrows_dst, item_ct1,
 
2044
  dpct::has_capability_or_fail(stream->get_device(),
2045
  {sycl::aspect::fp16});
2046
 
2047
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2048
  sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
2049
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2050
  sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
 
2055
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2056
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2057
 
2058
+ sycl_parallel_for(
2059
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2060
  mul_mat_q5_0<need_check>(
2061
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2062
  nrows_dst, item_ct1,
 
2078
  dpct::has_capability_or_fail(stream->get_device(),
2079
  {sycl::aspect::fp16});
2080
 
2081
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2082
  sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
2083
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2084
  sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
 
2089
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2090
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2091
 
2092
+ sycl_parallel_for(
2093
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2094
  mul_mat_q5_0<need_check>(
2095
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2096
  nrows_dst, item_ct1,
 
2157
  dpct::has_capability_or_fail(stream->get_device(),
2158
  {sycl::aspect::fp16});
2159
 
2160
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2161
  sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
2162
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2163
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
 
2168
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2169
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2170
 
2171
+ sycl_parallel_for(
2172
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2173
  mul_mat_q5_1<need_check>(
2174
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2175
  nrows_dst, item_ct1,
 
2191
  dpct::has_capability_or_fail(stream->get_device(),
2192
  {sycl::aspect::fp16});
2193
 
2194
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2195
  sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
2196
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2197
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
 
2202
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2203
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2204
 
2205
+ sycl_parallel_for(
2206
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2207
  mul_mat_q5_1<need_check>(
2208
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2209
  nrows_dst, item_ct1,
 
2270
  dpct::has_capability_or_fail(stream->get_device(),
2271
  {sycl::aspect::fp16});
2272
 
2273
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2274
  sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
2275
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2276
  sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
 
2281
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2282
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2283
 
2284
+ sycl_parallel_for(
2285
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2286
  mul_mat_q8_0<need_check>(
2287
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2288
  nrows_dst, item_ct1,
 
2304
  dpct::has_capability_or_fail(stream->get_device(),
2305
  {sycl::aspect::fp16});
2306
 
2307
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2308
  sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
2309
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2310
  sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
 
2315
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2316
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2317
 
2318
+ sycl_parallel_for(
2319
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2320
  mul_mat_q8_0<need_check>(
2321
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2322
  nrows_dst, item_ct1,
 
2383
  dpct::has_capability_or_fail(stream->get_device(),
2384
  {sycl::aspect::fp16});
2385
 
2386
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2387
  sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
2388
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2389
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
 
2396
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2397
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2398
 
2399
+ sycl_parallel_for(
2400
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2401
  mul_mat_q2_K<need_check>(
2402
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2403
  nrows_dst, item_ct1,
 
2420
  dpct::has_capability_or_fail(stream->get_device(),
2421
  {sycl::aspect::fp16});
2422
 
2423
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2424
  sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
2425
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2426
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
 
2433
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2434
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2435
 
2436
+ sycl_parallel_for(
2437
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2438
  mul_mat_q2_K<need_check>(
2439
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2440
  nrows_dst, item_ct1,
 
2504
  dpct::has_capability_or_fail(stream->get_device(),
2505
  {sycl::aspect::fp16});
2506
 
2507
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2508
  sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
2509
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2510
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
 
2519
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2520
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2521
 
2522
+ sycl_parallel_for(
2523
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2524
  mul_mat_q3_K<need_check>(
2525
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2526
  nrows_dst, item_ct1,
 
2544
  dpct::has_capability_or_fail(stream->get_device(),
2545
  {sycl::aspect::fp16});
2546
 
2547
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2548
  sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
2549
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2550
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
 
2559
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2560
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2561
 
2562
+ sycl_parallel_for(
2563
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2564
  mul_mat_q3_K<need_check>(
2565
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2566
  nrows_dst, item_ct1,
 
2630
  dpct::has_capability_or_fail(stream->get_device(),
2631
  {sycl::aspect::fp16});
2632
 
2633
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2634
  sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
2635
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2636
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
 
2643
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2644
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2645
 
2646
+ sycl_parallel_for(
2647
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2648
  mul_mat_q4_K<need_check>(
2649
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2650
  nrows_dst, item_ct1,
 
2667
  dpct::has_capability_or_fail(stream->get_device(),
2668
  {sycl::aspect::fp16});
2669
 
2670
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2671
  sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
2672
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2673
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
 
2680
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2681
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2682
 
2683
+ sycl_parallel_for(
2684
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2685
  mul_mat_q4_K<need_check>(
2686
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2687
  nrows_dst, item_ct1,
 
2749
  dpct::has_capability_or_fail(stream->get_device(),
2750
  {sycl::aspect::fp16});
2751
 
2752
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2753
  sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
2754
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2755
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
 
2762
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2763
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2764
 
2765
+ sycl_parallel_for(
2766
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2767
  mul_mat_q5_K<need_check>(
2768
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2769
  nrows_dst, item_ct1,
 
2786
  dpct::has_capability_or_fail(stream->get_device(),
2787
  {sycl::aspect::fp16});
2788
 
2789
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2790
  sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
2791
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2792
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
 
2799
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2800
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2801
 
2802
+ sycl_parallel_for(
2803
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2804
  mul_mat_q5_K<need_check>(
2805
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2806
  nrows_dst, item_ct1,
 
2868
  dpct::has_capability_or_fail(stream->get_device(),
2869
  {sycl::aspect::fp16});
2870
 
2871
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2872
  sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
2873
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2874
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
 
2881
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2882
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2883
 
2884
+ sycl_parallel_for(
2885
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2886
  mul_mat_q6_K<need_check>(
2887
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2888
  nrows_dst, item_ct1,
 
2905
  dpct::has_capability_or_fail(stream->get_device(),
2906
  {sycl::aspect::fp16});
2907
 
2908
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2909
  sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
2910
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2911
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
 
2918
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2919
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2920
 
2921
+ sycl_parallel_for(
2922
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
2923
  mul_mat_q6_K<need_check>(
2924
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2925
  nrows_dst, item_ct1,
ggml/src/ggml-sycl/mmvq.cpp CHANGED
@@ -544,12 +544,12 @@ static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy,
544
  const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
545
  const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
546
 
547
- stream->submit([&](sycl::handler & cgh) {
548
- cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
549
- [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
550
- mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0>>(vx, vy, dst, ncols, nrows,
551
- nd_item);
552
- });
553
  });
554
  }
555
 
@@ -561,12 +561,12 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float *
561
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
562
 
563
  {
564
- stream->submit([&](sycl::handler & cgh) {
565
- cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
566
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
567
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
568
- vx, vy, dst, ncols, nrows, item_ct1);
569
- });
570
  });
571
  }
572
  }
@@ -580,17 +580,12 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
580
  const sycl::range<3> block_nums(1, 1, block_num_y);
581
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
582
  {
583
-
584
- stream->submit([&](sycl::handler &cgh) {
585
-
586
- cgh.parallel_for(
587
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
588
- [=](sycl::nd_item<3> item_ct1)
589
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
590
- mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
591
- VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
592
- vx, vy, dst, ncols, nrows, item_ct1);
593
- });
594
  });
595
  }
596
  }
@@ -604,17 +599,12 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
604
  const sycl::range<3> block_nums(1, 1, block_num_y);
605
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
606
  {
607
-
608
- stream->submit([&](sycl::handler &cgh) {
609
-
610
- cgh.parallel_for(
611
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
612
- [=](sycl::nd_item<3> item_ct1)
613
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
614
- mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
615
- VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
616
- vx, vy, dst, ncols, nrows, item_ct1);
617
- });
618
  });
619
  }
620
  }
@@ -628,17 +618,12 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
628
  const sycl::range<3> block_nums(1, 1, block_num_y);
629
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
630
  {
631
-
632
- stream->submit([&](sycl::handler &cgh) {
633
-
634
- cgh.parallel_for(
635
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
636
- [=](sycl::nd_item<3> item_ct1)
637
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
638
- mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
639
- VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
640
- vx, vy, dst, ncols, nrows, item_ct1);
641
- });
642
  });
643
  }
644
  }
@@ -652,17 +637,12 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
652
  const sycl::range<3> block_nums(1, 1, block_num_y);
653
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
654
  {
655
-
656
- stream->submit([&](sycl::handler &cgh) {
657
-
658
- cgh.parallel_for(
659
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
660
- [=](sycl::nd_item<3> item_ct1)
661
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
662
- mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
663
- VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
664
- vx, vy, dst, ncols, nrows, item_ct1);
665
- });
666
  });
667
  }
668
  }
@@ -676,17 +656,12 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
676
  const sycl::range<3> block_nums(1, 1, block_num_y);
677
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
678
  {
679
-
680
- stream->submit([&](sycl::handler &cgh) {
681
-
682
- cgh.parallel_for(
683
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
684
- [=](sycl::nd_item<3> item_ct1)
685
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
686
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
687
- VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
688
- vx, vy, dst, ncols, nrows, item_ct1);
689
- });
690
  });
691
  }
692
  }
@@ -700,17 +675,12 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
700
  const sycl::range<3> block_nums(1, 1, block_num_y);
701
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
702
  {
703
-
704
- stream->submit([&](sycl::handler &cgh) {
705
-
706
- cgh.parallel_for(
707
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
708
- [=](sycl::nd_item<3> item_ct1)
709
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
710
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
711
- VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
712
- vx, vy, dst, ncols, nrows, item_ct1);
713
- });
714
  });
715
  }
716
  }
@@ -724,17 +694,12 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
724
  const sycl::range<3> block_nums(1, 1, block_num_y);
725
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
726
  {
727
-
728
- stream->submit([&](sycl::handler &cgh) {
729
-
730
- cgh.parallel_for(
731
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
732
- [=](sycl::nd_item<3> item_ct1)
733
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
734
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
735
- VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
736
- vx, vy, dst, ncols, nrows, item_ct1);
737
- });
738
  });
739
  }
740
  }
@@ -750,12 +715,12 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy,
750
  const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
751
  const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
752
 
753
- stream->submit([&](sycl::handler & cgh) {
754
- cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
755
- [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
756
- mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols,
757
- nrows, nd_item);
758
- });
759
  });
760
  }
761
 
@@ -769,17 +734,12 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
769
  const sycl::range<3> block_nums(1, 1, block_num_y);
770
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
771
  {
772
-
773
- stream->submit([&](sycl::handler &cgh) {
774
-
775
- cgh.parallel_for(
776
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
777
- [=](sycl::nd_item<3> item_ct1)
778
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
779
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
780
- VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
781
- vx, vy, dst, ncols, nrows, item_ct1);
782
- });
783
  });
784
  }
785
  }
@@ -794,12 +754,12 @@ static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy,
794
  const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
795
  const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
796
 
797
- stream->submit([&](sycl::handler & cgh) {
798
- cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
799
- [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
800
- mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(vx, vy, dst, ncols, nrows,
801
- nd_item);
802
- });
803
  });
804
  }
805
  static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
@@ -811,17 +771,12 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
811
  const sycl::range<3> block_nums(1, 1, block_num_y);
812
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
813
  {
814
-
815
- stream->submit([&](sycl::handler &cgh) {
816
-
817
- cgh.parallel_for(
818
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
819
- [=](sycl::nd_item<3> item_ct1)
820
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
821
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
822
- VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
823
- vx, vy, dst, ncols, nrows, item_ct1);
824
- });
825
  });
826
  }
827
  }
@@ -836,14 +791,12 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
836
  const sycl::range<3> block_nums(1, 1, block_num_y);
837
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
838
  {
839
- stream->submit([&](sycl::handler &cgh) {
840
- cgh.parallel_for(
841
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
842
- [=](sycl::nd_item<3> item_ct1)
843
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
844
- mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
845
- vx, vy, dst, ncols, nrows, item_ct1);
846
- });
847
  });
848
  }
849
  }
@@ -857,14 +810,12 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
857
  const sycl::range<3> block_nums(1, 1, block_num_y);
858
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
859
  {
860
- stream->submit([&](sycl::handler & cgh) {
861
- cgh.parallel_for(
862
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
863
- [=](sycl::nd_item<3> item_ct1)
864
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
865
- mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
866
- vx, vy, dst, ncols, nrows, item_ct1);
867
- });
868
  });
869
  }
870
  }
@@ -878,15 +829,12 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
878
  const sycl::range<3> block_nums(1, 1, block_num_y);
879
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
880
  {
881
-
882
- stream->submit([&](sycl::handler &cgh) {
883
- cgh.parallel_for(
884
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
885
- [=](sycl::nd_item<3> item_ct1)
886
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
887
- mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
888
- vx, vy, dst, ncols, nrows, item_ct1);
889
- });
890
  });
891
  }
892
  }
@@ -900,15 +848,12 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
900
  const sycl::range<3> block_nums(1, 1, block_num_y);
901
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
902
  {
903
-
904
- stream->submit([&](sycl::handler &cgh) {
905
- cgh.parallel_for(
906
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
907
- [=](sycl::nd_item<3> item_ct1)
908
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
909
- mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
910
- vx, vy, dst, ncols, nrows, item_ct1);
911
- });
912
  });
913
  }
914
  }
@@ -922,15 +867,12 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
922
  const sycl::range<3> block_nums(1, 1, block_num_y);
923
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
924
  {
925
-
926
- stream->submit([&](sycl::handler &cgh) {
927
- cgh.parallel_for(
928
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
929
- [=](sycl::nd_item<3> item_ct1)
930
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
931
- mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
932
- vx, vy, dst, ncols, nrows, item_ct1);
933
- });
934
  });
935
  }
936
  }
@@ -944,15 +886,12 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
944
  const sycl::range<3> block_nums(1, 1, block_num_y);
945
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
946
  {
947
-
948
- stream->submit([&](sycl::handler &cgh) {
949
- cgh.parallel_for(
950
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
951
- [=](sycl::nd_item<3> item_ct1)
952
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
953
- mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
954
- vx, vy, dst, ncols, nrows, item_ct1);
955
- });
956
  });
957
  }
958
  }
@@ -966,14 +905,12 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
966
  const sycl::range<3> block_nums(1, 1, block_num_y);
967
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
968
  {
969
- stream->submit([&](sycl::handler &cgh) {
970
- cgh.parallel_for(
971
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
972
- [=](sycl::nd_item<3> item_ct1)
973
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
974
- mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
975
- vx, vy, dst, ncols, nrows, item_ct1);
976
- });
977
  });
978
  }
979
  }
@@ -987,15 +924,12 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
987
  const sycl::range<3> block_nums(1, 1, block_num_y);
988
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
989
  {
990
-
991
- stream->submit([&](sycl::handler &cgh) {
992
- cgh.parallel_for(
993
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
994
- [=](sycl::nd_item<3> item_ct1)
995
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
996
- mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
997
- vx, vy, dst, ncols, nrows, item_ct1);
998
- });
999
  });
1000
  }
1001
  }
@@ -1009,15 +943,12 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
1009
  const sycl::range<3> block_nums(1, 1, block_num_y);
1010
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
1011
  {
1012
-
1013
- stream->submit([&](sycl::handler &cgh) {
1014
- cgh.parallel_for(
1015
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1016
- [=](sycl::nd_item<3> item_ct1)
1017
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
1018
- mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
1019
- vx, vy, dst, ncols, nrows, item_ct1);
1020
- });
1021
  });
1022
  }
1023
  }
 
544
  const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
545
  const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
546
 
547
+ sycl_launch(stream, [&](sycl::handler & cgh) {
548
+ sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size),
549
+ [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
550
+ mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0>>(vx, vy, dst, ncols, nrows,
551
+ nd_item);
552
+ });
553
  });
554
  }
555
 
 
561
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
562
 
563
  {
564
+ sycl_launch(stream, [&](sycl::handler & cgh) {
565
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
566
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
567
+ mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
568
+ vx, vy, dst, ncols, nrows, item_ct1);
569
+ });
570
  });
571
  }
572
  }
 
580
  const sycl::range<3> block_nums(1, 1, block_num_y);
581
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
582
  {
583
+ sycl_launch(stream, [&](sycl::handler & cgh) {
584
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
585
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
586
+ mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
587
+ vx, vy, dst, ncols, nrows, item_ct1);
588
+ });
 
 
 
 
 
589
  });
590
  }
591
  }
 
599
  const sycl::range<3> block_nums(1, 1, block_num_y);
600
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
601
  {
602
+ sycl_launch(stream, [&](sycl::handler & cgh) {
603
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
604
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
605
+ mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
606
+ vx, vy, dst, ncols, nrows, item_ct1);
607
+ });
 
 
 
 
 
608
  });
609
  }
610
  }
 
618
  const sycl::range<3> block_nums(1, 1, block_num_y);
619
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
620
  {
621
+ sycl_launch(stream, [&](sycl::handler & cgh) {
622
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
623
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
624
+ mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
625
+ vx, vy, dst, ncols, nrows, item_ct1);
626
+ });
 
 
 
 
 
627
  });
628
  }
629
  }
 
637
  const sycl::range<3> block_nums(1, 1, block_num_y);
638
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
639
  {
640
+ sycl_launch(stream, [&](sycl::handler & cgh) {
641
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
642
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
643
+ mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
644
+ vx, vy, dst, ncols, nrows, item_ct1);
645
+ });
 
 
 
 
 
646
  });
647
  }
648
  }
 
656
  const sycl::range<3> block_nums(1, 1, block_num_y);
657
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
658
  {
659
+ sycl_launch(stream, [&](sycl::handler & cgh) {
660
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
661
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
662
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
663
+ vx, vy, dst, ncols, nrows, item_ct1);
664
+ });
 
 
 
 
 
665
  });
666
  }
667
  }
 
675
  const sycl::range<3> block_nums(1, 1, block_num_y);
676
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
677
  {
678
+ sycl_launch(stream, [&](sycl::handler & cgh) {
679
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
680
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
681
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
682
+ vx, vy, dst, ncols, nrows, item_ct1);
683
+ });
 
 
 
 
 
684
  });
685
  }
686
  }
 
694
  const sycl::range<3> block_nums(1, 1, block_num_y);
695
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
696
  {
697
+ sycl_launch(stream, [&](sycl::handler & cgh) {
698
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
699
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
700
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
701
+ vx, vy, dst, ncols, nrows, item_ct1);
702
+ });
 
 
 
 
 
703
  });
704
  }
705
  }
 
715
  const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
716
  const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
717
 
718
+ sycl_launch(stream, [&](sycl::handler & cgh) {
719
+ sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size),
720
+ [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
721
+ mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols, nrows,
722
+ nd_item);
723
+ });
724
  });
725
  }
726
 
 
734
  const sycl::range<3> block_nums(1, 1, block_num_y);
735
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
736
  {
737
+ sycl_launch(stream, [&](sycl::handler & cgh) {
738
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
739
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
740
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
741
+ vx, vy, dst, ncols, nrows, item_ct1);
742
+ });
 
 
 
 
 
743
  });
744
  }
745
  }
 
754
  const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
755
  const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
756
 
757
+ sycl_launch(stream, [&](sycl::handler & cgh) {
758
+ sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size),
759
+ [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
760
+ mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(vx, vy, dst, ncols, nrows,
761
+ nd_item);
762
+ });
763
  });
764
  }
765
  static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
 
771
  const sycl::range<3> block_nums(1, 1, block_num_y);
772
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
773
  {
774
+ sycl_launch(stream, [&](sycl::handler & cgh) {
775
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
776
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
777
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
778
+ vx, vy, dst, ncols, nrows, item_ct1);
779
+ });
 
 
 
 
 
780
  });
781
  }
782
  }
 
791
  const sycl::range<3> block_nums(1, 1, block_num_y);
792
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
793
  {
794
+ sycl_launch(stream, [&](sycl::handler & cgh) {
795
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
796
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
797
+ mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS / 2, block_iq2_xxs, 1>(vx, vy, dst, ncols,
798
+ nrows, item_ct1);
799
+ });
 
 
800
  });
801
  }
802
  }
 
810
  const sycl::range<3> block_nums(1, 1, block_num_y);
811
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
812
  {
813
+ sycl_launch(stream, [&](sycl::handler & cgh) {
814
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
815
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
816
+ mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS / 2, block_iq2_xs, 1>(vx, vy, dst, ncols,
817
+ nrows, item_ct1);
818
+ });
 
 
819
  });
820
  }
821
  }
 
829
  const sycl::range<3> block_nums(1, 1, block_num_y);
830
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
831
  {
832
+ sycl_launch(stream, [&](sycl::handler & cgh) {
833
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
834
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
835
+ mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S / 2, block_iq2_s, 1>(vx, vy, dst, ncols, nrows,
836
+ item_ct1);
837
+ });
 
 
 
838
  });
839
  }
840
  }
 
848
  const sycl::range<3> block_nums(1, 1, block_num_y);
849
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
850
  {
851
+ sycl_launch(stream, [&](sycl::handler & cgh) {
852
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
853
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
854
+ mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS / 2, block_iq3_xxs, 1>(vx, vy, dst, ncols,
855
+ nrows, item_ct1);
856
+ });
 
 
 
857
  });
858
  }
859
  }
 
867
  const sycl::range<3> block_nums(1, 1, block_num_y);
868
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
869
  {
870
+ sycl_launch(stream, [&](sycl::handler & cgh) {
871
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
872
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
873
+ mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S / 2, block_iq3_s, 1>(vx, vy, dst, ncols, nrows,
874
+ item_ct1);
875
+ });
 
 
 
876
  });
877
  }
878
  }
 
886
  const sycl::range<3> block_nums(1, 1, block_num_y);
887
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
888
  {
889
+ sycl_launch(stream, [&](sycl::handler & cgh) {
890
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
891
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
892
+ mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(vx, vy, dst, ncols, nrows,
893
+ item_ct1);
894
+ });
 
 
 
895
  });
896
  }
897
  }
 
905
  const sycl::range<3> block_nums(1, 1, block_num_y);
906
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
907
  {
908
+ sycl_launch(stream, [&](sycl::handler & cgh) {
909
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
910
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
911
+ mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(vx, vy, dst, ncols, nrows,
912
+ item_ct1);
913
+ });
 
 
914
  });
915
  }
916
  }
 
924
  const sycl::range<3> block_nums(1, 1, block_num_y);
925
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
926
  {
927
+ sycl_launch(stream, [&](sycl::handler & cgh) {
928
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
929
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
930
+ mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(vx, vy, dst, ncols, nrows,
931
+ item_ct1);
932
+ });
 
 
 
933
  });
934
  }
935
  }
 
943
  const sycl::range<3> block_nums(1, 1, block_num_y);
944
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
945
  {
946
+ sycl_launch(stream, [&](sycl::handler & cgh) {
947
+ sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
948
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
949
+ mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS / 4, block_iq4_xs, 1>(vx, vy, dst, ncols,
950
+ nrows, item_ct1);
951
+ });
 
 
 
952
  });
953
  }
954
  }
ggml/src/ggml-sycl/norm.cpp CHANGED
@@ -254,14 +254,13 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
254
  GGML_ASSERT(ncols % WARP_SIZE == 0);
255
  if (ncols < 1024) {
256
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
257
- stream->submit([&](sycl::handler& cgh) {
258
- cgh.parallel_for(
259
- sycl::nd_range<3>(global_dims * block_dims, block_dims),
260
- [=](sycl::nd_item<3> item_ct1)
261
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
262
- norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
263
- });
264
- });
265
  }
266
  else {
267
  const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -272,16 +271,15 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
272
  the limit. To get the device limit, query
273
  info::device::max_work_group_size. Adjust the work-group size if needed.
274
  */
275
- stream->submit([&](sycl::handler& cgh) {
276
  sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
277
  sycl::range<1>(work_group_size / WARP_SIZE), cgh);
278
- cgh.parallel_for(
279
- sycl::nd_range<3>(global_dims * block_dims, block_dims),
280
- [=](sycl::nd_item<3> item_ct1)
281
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
282
- norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
283
- });
284
- });
285
  }
286
  }
287
 
@@ -290,18 +288,14 @@ static void group_norm_f32_sycl(const float* x, float* dst,
290
  const int ne_elements, queue_ptr stream, int device) {
291
  if (group_size < 1024) {
292
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
293
- stream->submit([&](sycl::handler& cgh) {
294
  const float eps_ct4 = eps;
295
- cgh.parallel_for(
296
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
297
- block_dims),
298
- [=](sycl::nd_item<3> item_ct1)
299
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
300
- group_norm_f32(
301
- x, dst, group_size, ne_elements, eps_ct4, item_ct1,
302
- nullptr, WARP_SIZE);
303
- });
304
- });
305
  }
306
  else {
307
  const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -313,22 +307,18 @@ static void group_norm_f32_sycl(const float* x, float* dst,
313
  info::device::max_work_group_size. Adjust the work-group size if needed.
314
  */
315
 
316
- stream->submit([&](sycl::handler& cgh) {
317
  sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
318
  cgh);
319
 
320
  const float eps_ct4 = eps;
321
 
322
- cgh.parallel_for(
323
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
324
- block_dims),
325
- [=](sycl::nd_item<3> item_ct1)
326
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
327
- group_norm_f32(x, dst, group_size, ne_elements,
328
- eps_ct4, item_ct1,
329
- get_pointer(s_sum_acc_ct1), work_group_size);
330
- });
331
- });
332
  }
333
  }
334
 
@@ -340,14 +330,13 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
340
  const sycl::range<3> global_dims(nsamples, nchannels, nrows);
341
  if (ncols < 1024) {
342
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
343
- stream->submit([&](sycl::handler& cgh) {
344
- cgh.parallel_for(
345
- sycl::nd_range<3>(global_dims * block_dims, block_dims),
346
- [=](sycl::nd_item<3> item_ct1)
347
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
348
- rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
349
- });
350
- });
351
  }
352
  else {
353
  const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -358,16 +347,15 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
358
  the limit. To get the device limit, query
359
  info::device::max_work_group_size. Adjust the work-group size if needed.
360
  */
361
- stream->submit([&](sycl::handler& cgh) {
362
  sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
363
  cgh);
364
- cgh.parallel_for(
365
- sycl::nd_range<3>(global_dims * block_dims, block_dims),
366
- [=](sycl::nd_item<3> item_ct1)
367
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
368
- rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
369
- });
370
- });
371
  }
372
  }
373
 
@@ -378,16 +366,12 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
378
  // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
379
  if (ncols < 1024) {
380
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
381
- stream->submit([&](sycl::handler& cgh) {
382
- cgh.parallel_for(
383
- sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
384
- block_dims),
385
- [=](sycl::nd_item<3> item_ct1)
386
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
387
- l2_norm_f32(x, dst, ncols, eps, item_ct1,
388
- nullptr, WARP_SIZE);
389
- });
390
- });
391
  }
392
  else {
393
  const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -398,18 +382,15 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
398
  the limit. To get the device limit, query
399
  info::device::max_work_group_size. Adjust the work-group size if needed.
400
  */
401
- stream->submit([&](sycl::handler& cgh) {
402
  sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
403
  cgh);
404
- cgh.parallel_for(
405
- sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
406
- block_dims),
407
- [=](sycl::nd_item<3> item_ct1)
408
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
409
- l2_norm_f32(x, dst, ncols, eps, item_ct1,
410
- get_pointer(s_sum_acc_ct1), work_group_size);
411
- });
412
- });
413
  }
414
  }
415
 
 
254
  GGML_ASSERT(ncols % WARP_SIZE == 0);
255
  if (ncols < 1024) {
256
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
257
+ sycl_launch(stream, [&](sycl::handler & cgh) {
258
+ sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
259
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
260
+ norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
261
+ nullptr, WARP_SIZE);
262
+ });
263
+ });
 
264
  }
265
  else {
266
  const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
 
271
  the limit. To get the device limit, query
272
  info::device::max_work_group_size. Adjust the work-group size if needed.
273
  */
274
+ sycl_launch(stream, [&](sycl::handler & cgh) {
275
  sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
276
  sycl::range<1>(work_group_size / WARP_SIZE), cgh);
277
+ sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
278
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
279
+ norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
280
+ get_pointer(s_sum_acc_ct1), work_group_size);
281
+ });
282
+ });
 
283
  }
284
  }
285
 
 
288
  const int ne_elements, queue_ptr stream, int device) {
289
  if (group_size < 1024) {
290
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
291
+ sycl_launch(stream, [&](sycl::handler & cgh) {
292
  const float eps_ct4 = eps;
293
+ sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims),
294
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
295
+ group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1, nullptr,
296
+ WARP_SIZE);
297
+ });
298
+ });
 
 
 
 
299
  }
300
  else {
301
  const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
 
307
  info::device::max_work_group_size. Adjust the work-group size if needed.
308
  */
309
 
310
+ sycl_launch(stream, [&](sycl::handler & cgh) {
311
  sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
312
  cgh);
313
 
314
  const float eps_ct4 = eps;
315
 
316
+ sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims),
317
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
318
+ group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1,
319
+ get_pointer(s_sum_acc_ct1), work_group_size);
320
+ });
321
+ });
 
 
 
 
322
  }
323
  }
324
 
 
330
  const sycl::range<3> global_dims(nsamples, nchannels, nrows);
331
  if (ncols < 1024) {
332
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
333
+ sycl_launch(stream, [&](sycl::handler & cgh) {
334
+ sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
335
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
336
+ rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
337
+ nullptr, WARP_SIZE);
338
+ });
339
+ });
 
340
  }
341
  else {
342
  const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
 
347
  the limit. To get the device limit, query
348
  info::device::max_work_group_size. Adjust the work-group size if needed.
349
  */
350
+ sycl_launch(stream, [&](sycl::handler & cgh) {
351
  sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
352
  cgh);
353
+ sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
354
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
355
+ rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
356
+ get_pointer(s_sum_acc_ct1), work_group_size);
357
+ });
358
+ });
 
359
  }
360
  }
361
 
 
366
  // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
367
  if (ncols < 1024) {
368
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
369
+ sycl_launch(stream, [&](sycl::handler & cgh) {
370
+ sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
371
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
372
+ l2_norm_f32(x, dst, ncols, eps, item_ct1, nullptr, WARP_SIZE);
373
+ });
374
+ });
 
 
 
 
375
  }
376
  else {
377
  const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
 
382
  the limit. To get the device limit, query
383
  info::device::max_work_group_size. Adjust the work-group size if needed.
384
  */
385
+ sycl_launch(stream, [&](sycl::handler & cgh) {
386
  sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
387
  cgh);
388
+ sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
389
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
390
+ l2_norm_f32(x, dst, ncols, eps, item_ct1, get_pointer(s_sum_acc_ct1),
391
+ work_group_size);
392
+ });
393
+ });
 
 
 
394
  }
395
  }
396
 
ggml/src/ggml-sycl/rope.cpp CHANGED
@@ -235,20 +235,22 @@ static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, c
235
  the limit. To get the device limit, query
236
  info::device::max_work_group_size. Adjust the work-group size if needed.
237
  */
238
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
239
- rope_norm<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
240
- theta_scale, freq_factors, item_ct1);
241
- });
 
242
  } else {
243
  /*
244
  DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
245
  the limit. To get the device limit, query
246
  info::device::max_work_group_size. Adjust the work-group size if needed.
247
  */
248
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
249
- rope_norm<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
250
- theta_scale, freq_factors, item_ct1);
251
- });
 
252
  }
253
  }
254
 
@@ -267,15 +269,17 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c
267
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
268
 
269
  if (freq_factors == nullptr) {
270
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
271
- rope_neox<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
272
- theta_scale, freq_factors, item_ct1);
273
- });
 
274
  } else {
275
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
276
- rope_neox<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
277
- theta_scale, freq_factors, item_ct1);
278
- });
 
279
  }
280
  }
281
 
@@ -298,12 +302,12 @@ static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1,
298
  }
299
  // launch kernel
300
  if (freq_factors == nullptr) {
301
- stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
302
  rope_multi<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
303
  corr_dims, theta_scale, freq_factors, sections, item_ct1);
304
  });
305
  } else {
306
- stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
307
  rope_multi<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
308
  corr_dims, theta_scale, freq_factors, sections, item_ct1);
309
  });
@@ -333,12 +337,12 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1,
333
  }
334
  // launch kernel
335
  if (freq_factors == nullptr) {
336
- stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
337
  rope_vision<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
338
  corr_dims, theta_scale, freq_factors, sections, item_ct1);
339
  });
340
  } else {
341
- stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
342
  rope_vision<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
343
  corr_dims, theta_scale, freq_factors, sections, item_ct1);
344
  });
 
235
  the limit. To get the device limit, query
236
  info::device::max_work_group_size. Adjust the work-group size if needed.
237
  */
238
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
239
+ [=](sycl::nd_item<3> item_ct1) {
240
+ rope_norm<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
241
+ attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
242
+ });
243
  } else {
244
  /*
245
  DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
246
  the limit. To get the device limit, query
247
  info::device::max_work_group_size. Adjust the work-group size if needed.
248
  */
249
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
250
+ [=](sycl::nd_item<3> item_ct1) {
251
+ rope_norm<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
252
+ attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
253
+ });
254
  }
255
  }
256
 
 
269
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
270
 
271
  if (freq_factors == nullptr) {
272
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
273
+ [=](sycl::nd_item<3> item_ct1) {
274
+ rope_neox<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
275
+ attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
276
+ });
277
  } else {
278
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
279
+ [=](sycl::nd_item<3> item_ct1) {
280
+ rope_neox<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
281
+ attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
282
+ });
283
  }
284
  }
285
 
 
302
  }
303
  // launch kernel
304
  if (freq_factors == nullptr) {
305
+ sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
306
  rope_multi<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
307
  corr_dims, theta_scale, freq_factors, sections, item_ct1);
308
  });
309
  } else {
310
+ sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
311
  rope_multi<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
312
  corr_dims, theta_scale, freq_factors, sections, item_ct1);
313
  });
 
337
  }
338
  // launch kernel
339
  if (freq_factors == nullptr) {
340
+ sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
341
  rope_vision<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
342
  corr_dims, theta_scale, freq_factors, sections, item_ct1);
343
  });
344
  } else {
345
+ sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
346
  rope_vision<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
347
  corr_dims, theta_scale, freq_factors, sections, item_ct1);
348
  });
ggml/src/ggml-sycl/softmax.cpp CHANGED
@@ -127,11 +127,11 @@ static void soft_max_f32_submitter(const float * x, const T * mask, float * dst,
127
  const int nrows_y, const float scale, const float max_bias, const float m0,
128
  const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
129
  const size_t n_local_scratch, queue_ptr stream) {
130
- stream->submit([&](sycl::handler &cgh) {
131
  sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
132
 
133
- cgh.parallel_for(
134
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
135
  [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
136
  soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
137
  nrows_y, scale, max_bias, m0,
 
127
  const int nrows_y, const float scale, const float max_bias, const float m0,
128
  const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
129
  const size_t n_local_scratch, queue_ptr stream) {
130
+ sycl_launch(stream, [&](sycl::handler & cgh) {
131
  sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
132
 
133
+ sycl_parallel_for(
134
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
135
  [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
136
  soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
137
  nrows_y, scale, max_bias, m0,
ggml/src/ggml-sycl/tsembd.cpp CHANGED
@@ -45,14 +45,9 @@ static void timestep_embedding_f32_sycl(
45
  int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE;
46
  sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE);
47
  sycl::range<3> gridDim(1, ne00, num_blocks);
48
- stream->parallel_for(
49
- sycl::nd_range<3>(
50
- gridDim * block_dims, block_dims),
51
- [=](sycl::nd_item<3> item_ct1) {
52
- timestep_embedding_f32(
53
- x, dst, nb1, dim, max_period, item_ct1
54
- );
55
- });
56
  }
57
 
58
  void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
45
  int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE;
46
  sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE);
47
  sycl::range<3> gridDim(1, ne00, num_blocks);
48
+ sycl_parallel_for(stream, sycl::nd_range<3>(gridDim * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
49
+ timestep_embedding_f32(x, dst, nb1, dim, max_period, item_ct1);
50
+ });
 
 
 
 
 
51
  }
52
 
53
  void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml/src/ggml-sycl/wkv.cpp CHANGED
@@ -207,12 +207,11 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
207
 
208
  // Submit kernel
209
  if (C / H == WKV_BLOCK_SIZE) {
210
- stream->submit([&](sycl::handler& cgh) {
211
  sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
212
 
213
- cgh.parallel_for(
214
- sycl::nd_range<3>(grid_dims * block_dims, block_dims),
215
- [=](sycl::nd_item<3> item_ct1) {
216
  rwkv_wkv6_f32_kernel<WKV_BLOCK_SIZE>(
217
  B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
218
  item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
@@ -220,12 +219,11 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
220
  });
221
  });
222
  } else {
223
- stream->submit([&](sycl::handler& cgh) {
224
  sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
225
 
226
- cgh.parallel_for(
227
- sycl::nd_range<3>(grid_dims * block_dims, block_dims),
228
- [=](sycl::nd_item<3> item_ct1) {
229
  rwkv_wkv6_f32_kernel<WKV_BLOCK_SIZE * 2>(
230
  B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
231
  item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
@@ -264,12 +262,11 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
264
 
265
  // Submit kernel
266
  if (C / H == WKV_BLOCK_SIZE) {
267
- stream->submit([&](sycl::handler& cgh) {
268
  sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
269
 
270
- cgh.parallel_for(
271
- sycl::nd_range<3>(grid_dims * block_dims, block_dims),
272
- [=](sycl::nd_item<3> item_ct1) {
273
  rwkv_wkv7_f32_kernel<WKV_BLOCK_SIZE>(
274
  B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d,
275
  item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
@@ -277,12 +274,11 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
277
  });
278
  });
279
  } else {
280
- stream->submit([&](sycl::handler& cgh) {
281
  sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
282
 
283
- cgh.parallel_for(
284
- sycl::nd_range<3>(grid_dims * block_dims, block_dims),
285
- [=](sycl::nd_item<3> item_ct1) {
286
  rwkv_wkv7_f32_kernel<WKV_BLOCK_SIZE * 2>(
287
  B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d,
288
  item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
 
207
 
208
  // Submit kernel
209
  if (C / H == WKV_BLOCK_SIZE) {
210
+ sycl_launch(stream, [&](sycl::handler & cgh) {
211
  sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
212
 
213
+ sycl_parallel_for(
214
+ cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
215
  rwkv_wkv6_f32_kernel<WKV_BLOCK_SIZE>(
216
  B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
217
  item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
 
219
  });
220
  });
221
  } else {
222
+ sycl_launch(stream, [&](sycl::handler & cgh) {
223
  sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
224
 
225
+ sycl_parallel_for(
226
+ cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
227
  rwkv_wkv6_f32_kernel<WKV_BLOCK_SIZE * 2>(
228
  B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
229
  item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
 
262
 
263
  // Submit kernel
264
  if (C / H == WKV_BLOCK_SIZE) {
265
+ sycl_launch(stream, [&](sycl::handler & cgh) {
266
  sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
267
 
268
+ sycl_parallel_for(
269
+ cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
270
  rwkv_wkv7_f32_kernel<WKV_BLOCK_SIZE>(
271
  B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d,
272
  item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
 
274
  });
275
  });
276
  } else {
277
+ sycl_launch(stream, [&](sycl::handler & cgh) {
278
  sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
279
 
280
+ sycl_parallel_for(
281
+ cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
 
282
  rwkv_wkv7_f32_kernel<WKV_BLOCK_SIZE * 2>(
283
  B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d,
284
  item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()