Neo Zhang Jianyu, arthw committed on
Commit 14fd317 · 1 Parent(s): 310a36c

Optimize mul_mat for Q4_0 on Intel GPU (llama/12035)


* optimize performance by reordering Q4_0 data for Intel GPUs (see the layout sketch after this message)

* detect the hardware type, save the optimization features, and print them

* correct name

* optimize the graph once when computing it, record the optimization status in tensor->extra, and make CI pass

* add the environment variable GGML_SYCL_DISABLE_OPT for debugging

* use syclex::architecture instead of the custom hardware defines, and update the guide for GGML_SYCL_DISABLE_OPT

* add performance data

* move the getrows functions to separate files

* fix global variables

---------

Co-authored-by: arthw <[email protected]>
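For context on the reorder bullet above: the new *_reorder kernels in this change expect a Q4_0 tensor repacked so that all packed 4-bit quants come first and all fp16 scales follow, starting at byte offset ncols*nrows/2 (see dequantize_block_q4_0_reorder and dequantize_mul_mat_vec_reorder in the diff below). The following is a minimal host-side sketch of that repacking with plain C++ stand-ins; block_q4_0_host and reorder_q4_0 are hypothetical names, not the actual ggml/SYCL code.

#include <cstddef>
#include <cstdint>
#include <vector>

constexpr int QK4_0 = 32;                 // quants per Q4_0 block, as in ggml

struct block_q4_0_host {                  // stand-in for ggml's block_q4_0
    uint16_t d;                           // fp16 scale, kept as raw bits here
    uint8_t  qs[QK4_0 / 2];               // 32 4-bit quants packed into 16 bytes
};

// Repack interleaved blocks into the reordered layout the new kernels index:
//   [ qs(block 0) | qs(block 1) | ... | d(block 0) | d(block 1) | ... ]
// so quants and scales each form one contiguous region.
std::vector<uint8_t> reorder_q4_0(const block_q4_0_host * blocks, size_t nblocks) {
    const size_t qs_bytes = nblocks * (QK4_0 / 2);
    std::vector<uint8_t> out(qs_bytes + nblocks * sizeof(uint16_t));
    uint8_t  * qs_dst = out.data();
    uint16_t * d_dst  = reinterpret_cast<uint16_t *>(out.data() + qs_bytes);
    for (size_t ib = 0; ib < nblocks; ++ib) {
        for (int j = 0; j < QK4_0 / 2; ++j) {
            qs_dst[ib * (QK4_0 / 2) + j] = blocks[ib].qs[j];
        }
        d_dst[ib] = blocks[ib].d;
    }
    return out;
}

With ncols*nrows total weights this places the scales at byte offset ncols*nrows/2, which matches the d_ptr/qs indexing used by the kernels added in this commit.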

ggml/src/ggml-sycl/CMakeLists.txt CHANGED
@@ -1,3 +1,5 @@
 
 
1
  if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
2
  message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
3
  endif()
 
1
+ message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")
2
+
3
  if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
4
  message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
5
  endif()
ggml/src/ggml-sycl/common.cpp CHANGED
@@ -99,3 +99,20 @@ catch (sycl::exception const &exc) {
99
  << ", line:" << __LINE__ << std::endl;
100
  std::exit(1);
101
  }
99
  << ", line:" << __LINE__ << std::endl;
100
  std::exit(1);
101
  }
102
+
103
+
104
+ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
105
+ for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
106
+ for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
107
+ if (extra->events[i][is] != nullptr) {
108
+ SYCL_CHECK(CHECK_TRY_ERROR(dpct::destroy_event(extra->events[i][is])));
109
+ }
110
+ }
111
+ if (extra->data_device[i] != nullptr && streams.size()>0) {
112
+ ggml_sycl_set_device(i);
113
+ SYCL_CHECK(
114
+ CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
115
+ }
116
+ }
117
+ delete extra;
118
+ }
ggml/src/ggml-sycl/common.hpp CHANGED
@@ -19,6 +19,9 @@
19
  #include "dpct/helper.hpp"
20
  #include "ggml-sycl.h"
21
  #include "presets.hpp"
 
 
 
22
  #if GGML_SYCL_DNNL
23
  #include "dnnl.hpp"
24
  #include "dnnl_sycl.hpp"
@@ -35,7 +38,10 @@
35
  void* ggml_sycl_host_malloc(size_t size);
36
  void ggml_sycl_host_free(void* ptr);
37
 
 
38
  extern int g_ggml_sycl_debug;
 
 
39
  #define GGML_SYCL_DEBUG(...) \
40
  do { \
41
  if (g_ggml_sycl_debug) \
@@ -182,18 +188,24 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
182
  }
183
 
184
  //////////////////////
185
 
186
  struct ggml_sycl_device_info {
187
  int device_count;
188
 
189
- struct sycl_device_info {
190
- int cc; // compute capability
191
- // int nsm; // number of streaming multiprocessors
192
- // size_t smpb; // max. shared memory per block
193
- bool vmm; // virtual memory support
194
- size_t total_vram;
195
- };
196
-
197
  sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
198
 
199
  std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
@@ -260,17 +272,46 @@ struct ggml_tensor_extra_gpu {
260
  // tensors
261
  dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
262
  [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
 
263
  };
264
 
265
  struct ggml_backend_sycl_context {
266
  int device;
267
  std::string name;
 
 
268
 
269
  queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
270
 
271
  explicit ggml_backend_sycl_context(int device) :
272
  device(device),
273
  name(GGML_SYCL_NAME + std::to_string(device)) {
 
274
  }
275
 
276
  queue_ptr stream(int device, int stream) {
@@ -680,5 +721,4 @@ bool gpu_has_xmx(sycl::device &dev);
680
  void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
681
  const ggml_tensor *src1, ggml_tensor *dst,
682
  const ggml_sycl_op_flatten_t op);
683
-
684
  #endif // GGML_SYCL_COMMON_HPP
 
19
  #include "dpct/helper.hpp"
20
  #include "ggml-sycl.h"
21
  #include "presets.hpp"
22
+ #include "sycl_hw.hpp"
23
+
24
+
25
  #if GGML_SYCL_DNNL
26
  #include "dnnl.hpp"
27
  #include "dnnl_sycl.hpp"
 
38
  void* ggml_sycl_host_malloc(size_t size);
39
  void ggml_sycl_host_free(void* ptr);
40
 
41
+
42
  extern int g_ggml_sycl_debug;
43
+ extern int g_ggml_sycl_disable_optimize;
44
+
45
  #define GGML_SYCL_DEBUG(...) \
46
  do { \
47
  if (g_ggml_sycl_debug) \
 
188
  }
189
 
190
  //////////////////////
191
+ struct optimize_feature {
192
+ bool reorder=false;
193
+ };
194
+
195
+ struct sycl_device_info {
196
+ int cc; // compute capability
197
+ // int nsm; // number of streaming multiprocessors
198
+ // size_t smpb; // max. shared memory per block
199
+ bool vmm; // virtual memory support
200
+ size_t total_vram;
201
+ sycl_hw_info hw_info;
202
+ optimize_feature opt_feature;
203
+ };
204
+
205
 
206
  struct ggml_sycl_device_info {
207
  int device_count;
208
 
209
  sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
210
 
211
  std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
 
272
  // tensors
273
  dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
274
  [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
275
+ optimize_feature optimized_feature;
276
  };
277
 
278
+ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
279
+
280
+ inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
281
+ optimize_feature opt;
282
+
283
+ opt.reorder =
284
+ (arch == syclex::architecture::intel_gpu_dg1 ||
285
+ arch == syclex::architecture::intel_gpu_acm_g10 ||
286
+ arch == syclex::architecture::intel_gpu_acm_g11 ||
287
+ arch == syclex::architecture::intel_gpu_acm_g12 ||
288
+ arch == syclex::architecture::intel_gpu_pvc ||
289
+ arch == syclex::architecture::intel_gpu_pvc_vg ||
290
+ arch == syclex::architecture::intel_gpu_mtl_u ||
291
+ arch == syclex::architecture::intel_gpu_mtl_s ||
292
+ arch == syclex::architecture::intel_gpu_mtl_h ||
293
+ arch == syclex::architecture::intel_gpu_arl_u ||
294
+ arch == syclex::architecture::intel_gpu_arl_s ||
295
+ arch == syclex::architecture::intel_gpu_arl_h ||
296
+ arch == syclex::architecture::intel_gpu_bmg_g21 ||
297
+ arch == syclex::architecture::intel_gpu_lnl_m
298
+ );
299
+
300
+ return opt;
301
+ }
302
+
303
  struct ggml_backend_sycl_context {
304
  int device;
305
  std::string name;
306
+ optimize_feature opt_feature;
307
+ bool optimized_graph=false;
308
 
309
  queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
310
 
311
  explicit ggml_backend_sycl_context(int device) :
312
  device(device),
313
  name(GGML_SYCL_NAME + std::to_string(device)) {
314
+ opt_feature = ggml_sycl_info().devices[device].opt_feature;
315
  }
316
 
317
  queue_ptr stream(int device, int stream) {
 
721
  void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
722
  const ggml_tensor *src1, ggml_tensor *dst,
723
  const ggml_sycl_op_flatten_t op);
 
724
  #endif // GGML_SYCL_COMMON_HPP
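The pieces added to this header (optimize_feature, check_gpu_optimize_feature, g_ggml_sycl_disable_optimize) are combined at runtime in ggml-sycl.cpp. Below is a self-contained sketch of how the environment variable and the per-device reorder flag are intended to interact; get_env_int and the hard-coded device flag are illustrative stand-ins, not the actual backend code.

#include <cstdio>
#include <cstdlib>

// Reads an integer environment variable with a default, mirroring get_sycl_env().
static int get_env_int(const char * name, int def) {
    const char * v = std::getenv(name);
    return v ? std::atoi(v) : def;
}

int main() {
    // GGML_SYCL_DISABLE_OPT=1 turns the optimization off for debugging (default 0).
    const int  disable_opt             = get_env_int("GGML_SYCL_DISABLE_OPT", 0);
    const bool device_supports_reorder = true;  // stand-in for devices[i].opt_feature.reorder

    const bool use_reorder = !disable_opt && device_supports_reorder;
    std::printf("Q4_0 reorder path: %s\n", use_reorder ? "enabled" : "disabled");
    return 0;
}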
ggml/src/ggml-sycl/convert.cpp CHANGED
@@ -125,6 +125,25 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
125
  }
126
  }
127
 
128
  template <typename dst_t>
129
  static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
130
  dpct::queue_ptr stream) {
@@ -452,10 +471,15 @@ static void convert_unary_sycl(const void *__restrict__ vx,
452
  }
453
  }
454
 
455
- to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
456
  switch (type) {
457
  case GGML_TYPE_Q4_0:
458
- return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
 
 
 
 
 
459
  case GGML_TYPE_Q4_1:
460
  return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
461
  case GGML_TYPE_Q5_0:
@@ -499,10 +523,15 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
499
  }
500
  }
501
 
502
- to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
503
  switch (type) {
504
  case GGML_TYPE_Q4_0:
505
- return dequantize_row_q4_0_sycl;
 
 
 
 
 
506
  case GGML_TYPE_Q4_1:
507
  return dequantize_row_q4_1_sycl;
508
  case GGML_TYPE_Q5_0:
 
125
  }
126
  }
127
 
128
+ template <typename dst_t>
129
+ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
130
+ dpct::queue_ptr stream) {
131
+
132
+ dpct::has_capability_or_fail(stream->get_device(),
133
+ {sycl::aspect::fp16});
134
+
135
+ int constexpr WARP_K = WARP_SIZE * QK4_0;
136
+ const int n_warp = (k + WARP_K - 1) / WARP_K;
137
+ GGML_ASSERT(k % 2 == 0);
138
+ stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
139
+ sycl::range<3>(1, 1, WARP_SIZE),
140
+ sycl::range<3>(1, 1, WARP_SIZE)),
141
+ [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]]{
142
+ dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
143
+ });
144
+
145
+ }
146
+
147
  template <typename dst_t>
148
  static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
149
  dpct::queue_ptr stream) {
 
471
  }
472
  }
473
 
474
+ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
475
  switch (type) {
476
  case GGML_TYPE_Q4_0:
477
+ if (dst->src[0]->extra &&
478
+ ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
479
+ return dequantize_row_q4_0_sycl_reorder;
480
+ } else {
481
+ return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
482
+ }
483
  case GGML_TYPE_Q4_1:
484
  return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
485
  case GGML_TYPE_Q5_0:
 
523
  }
524
  }
525
 
526
+ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
527
  switch (type) {
528
  case GGML_TYPE_Q4_0:
529
+ if (dst->src[0]->extra &&
530
+ ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
531
+ return dequantize_row_q4_0_sycl_reorder;
532
+ } else {
533
+ return dequantize_row_q4_0_sycl;
534
+ }
535
  case GGML_TYPE_Q4_1:
536
  return dequantize_row_q4_1_sycl;
537
  case GGML_TYPE_Q5_0:
ggml/src/ggml-sycl/convert.hpp CHANGED
@@ -21,7 +21,7 @@ using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
21
  typedef to_t_sycl_t<float> to_fp32_sycl_t;
22
  typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
23
 
24
- to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type);
25
- to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type);
26
 
27
  #endif // GGML_SYCL_CONVERT_HPP
 
21
  typedef to_t_sycl_t<float> to_fp32_sycl_t;
22
  typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
23
 
24
+ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst);
25
+ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst);
26
 
27
  #endif // GGML_SYCL_CONVERT_HPP
ggml/src/ggml-sycl/dequantize.hpp CHANGED
@@ -16,6 +16,8 @@
16
  #include "common.hpp"
17
 
18
  typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
 
 
19
 
20
  static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib,
21
  const int iqs, dfloat2 &v) {
@@ -40,6 +42,29 @@ static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib,
40
  #endif // GGML_SYCL_F16
41
  }
42
 
43
  static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
44
  const int iqs, dfloat2 &v) {
45
  const block_q4_1 * x = (const block_q4_1 *) vx;
@@ -167,6 +192,36 @@ static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restri
167
  }
168
  }
169
 
170
  template<typename dst_t>
171
  static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
172
  const sycl::nd_item<3> &item_ct1) {
 
16
  #include "common.hpp"
17
 
18
  typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
19
+ typedef void (*dequantize_kernel_t_reorder)(const void *d, const int64_t ib, const void *qs,
20
+ const int iqs, dfloat2 &v);
21
 
22
  static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib,
23
  const int iqs, dfloat2 &v) {
 
42
  #endif // GGML_SYCL_F16
43
  }
44
 
45
+ static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int64_t ib, const void *qs,
46
+ const int iqs, dfloat2 &v) {
47
+ // const block_q4_0 * x = (const block_q4_0 *) vx;
48
+
49
+ const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib);
50
+
51
+ const int vui = *((const uint8_t *)qs+iqs);
52
+
53
+ v.x() = vui & 0xF;
54
+ v.y() = vui >> 4;
55
+
56
+ #ifdef GGML_SYCL_F16
57
+ // v = v - {8.0f, 8.0f};
58
+ // v = v * {d, d};
59
+ v.s0() = (v.s0() - 8.0f) * d;
60
+ v.s1() = (v.s1() - 8.0f) * d;
61
+
62
+ #else
63
+ v.x() = (v.x() - 8.0f) * d;
64
+ v.y() = (v.y() - 8.0f) * d;
65
+ #endif // GGML_SYCL_F16
66
+ }
67
+
68
  static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
69
  const int iqs, dfloat2 &v) {
70
  const block_q4_1 * x = (const block_q4_1 *) vx;
 
192
  }
193
  }
194
 
195
+ template<typename dst_t>
196
+ static void dequantize_block_q4_0_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
197
+ const sycl::nd_item<3> &item_ct1) {
198
+
199
+ const int64_t i = item_ct1.get_group(2);
200
+ auto k=nb32;
201
+ // assume 32 threads
202
+ const int64_t tid = item_ct1.get_local_id(2);
203
+ const int lane_ib = i * WARP_SIZE + tid;
204
+
205
+ if (lane_ib >= k / QK4_0) {
206
+ return;
207
+ }
208
+
209
+ dst_t * y_ptr = yy + lane_ib * QK4_0;
210
+
211
+ auto qs = (const uint8_t*)vx + lane_ib * QK4_0 / 2;
212
+ auto s_ptr = (const sycl::half*)((const uint8_t*)vx + k / 2) + lane_ib;
213
+
214
+ const float d = float(*s_ptr);
215
+
216
+ #pragma unroll
217
+ for (int l = 0; l < QK4_0 / 2; ++l) {
218
+ int vq = qs[l];
219
+ y_ptr[l + 0] = d * ((vq & 0xF) - 8);
220
+ y_ptr[l + 16] = d * ((vq >> 4) - 8);
221
+ }
222
+
223
+ }
224
+
225
  template<typename dst_t>
226
  static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
227
  const sycl::nd_item<3> &item_ct1) {
ggml/src/ggml-sycl/dmmv.cpp CHANGED
@@ -3,7 +3,6 @@
3
  #include "dequantize.hpp"
4
  #include "presets.hpp"
5
 
6
-
7
  static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
8
  const sycl::half *x = (const sycl::half *)vx;
9
 
@@ -91,6 +90,112 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
91
  }
92
  }
93
 
94
  static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
95
  float *dst, const int ncols,
96
  const int nrows,
@@ -759,6 +864,28 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
759
  }
760
  }
761
 
762
 
763
  static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
764
  float *dst, const int ncols,
@@ -953,7 +1080,6 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
953
 
954
  const int64_t ne00 = src0->ne[0];
955
  const int64_t row_diff = row_high - row_low;
956
-
957
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
958
  // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
959
  #ifdef GGML_SYCL_F16
@@ -967,7 +1093,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
967
 
968
  if (src1_convert_f16) {
969
  src1_dfloat = src1_dfloat_a.alloc(ne00);
970
- const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
971
  GGML_ASSERT(to_fp16_sycl != nullptr);
972
  to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream);
973
  }
@@ -977,7 +1103,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
977
 
978
  switch (src0->type) {
979
  case GGML_TYPE_Q4_0:
980
- dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 
 
 
 
 
981
  break;
982
  case GGML_TYPE_Q4_1:
983
  dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
@@ -1020,4 +1151,5 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
1020
  GGML_UNUSED(src1_ddq_i);
1021
  GGML_UNUSED(src1_ncols);
1022
  GGML_UNUSED(src1_padded_row_size);
 
1023
  }
 
3
  #include "dequantize.hpp"
4
  #include "presets.hpp"
5
 
 
6
  static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
7
  const sycl::half *x = (const sycl::half *)vx;
8
 
 
90
  }
91
  }
92
 
93
+ template <int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_reorder>
94
+ static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
95
+ const sycl::nd_item<3> &item_ct1) {
96
+ // qk = quantized weights per x block
97
+ // qr = number of quantized weights per data value in x block
98
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
99
+ item_ct1.get_local_id(1);
100
+
101
+ if (row >= nrows) {
102
+ return;
103
+ }
104
+
105
+ const int tid = item_ct1.get_local_id(2);
106
+
107
+
108
+ const int ncols_left = ncols % (QK4_0*WARP_SIZE);
109
+ const int ncols_align = ncols - ncols_left;
110
+ const int iter_stride = 8*2*GGML_SYCL_DMMV_X;
111
+ const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter //64/16=4, 512/16/2= 16
112
+ const int y_offset = qr == 1 ? 1 : qk/2;
113
+
114
+ // partial sum for each thread
115
+ #ifdef GGML_SYCL_F16
116
+ sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
117
+ #else
118
+ float tmp = 0.0f;
119
+ #endif // GGML_SYCL_F16
120
+ const char *d_ptr = (const char*)vx+ncols*nrows/2;
121
+ int i=0;
122
+ for (i = 0; i < ncols_align; i += iter_stride) {
123
+ const int col = i + vals_per_iter*tid;
124
+ const int ib = (row*ncols + col)/qk; // x block index
125
+ const int iqs = (col%qk)/qr; // x quant index
126
+ const int iybs = col - col%qk; // y block start index
127
+
128
+ // processing >2 values per i iter is faster for fast GPUs
129
+ #pragma unroll
130
+ for (int j = 0; j < vals_per_iter; j += 2) {
131
+ // process 2 vals per j iter
132
+
133
+ // dequantize
134
+ // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
135
+ dfloat2 v;
136
+ dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
137
+
138
+ // matrix multiplication
139
+ // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
140
+ #ifdef GGML_SYCL_F16
141
+ dfloat2 t1{y[iybs + iqs + j / qr + 0],
142
+ y[iybs + iqs + j / qr + y_offset]};
143
+
144
+ tmp += v * t1;
145
+ #else
146
+ tmp += v.x() * y[iybs + iqs + j / qr + 0];
147
+ tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
148
+ #endif // GGML_SYCL_F16
149
+ }
150
+ }
151
+
152
+ for (; i < ncols; i += iter_stride) {
153
+ if (tid>=ncols_left/QK4_0) continue;
154
+ const int col = i + vals_per_iter*tid;
155
+ const int ib = (row*ncols + col)/qk; // x block index
156
+ const int iqs = (col%qk)/qr; // x quant index
157
+ const int iybs = col - col%qk; // y block start index
158
+
159
+ // processing >2 values per i iter is faster for fast GPUs
160
+ #pragma unroll
161
+ for (int j = 0; j < vals_per_iter; j += 2) {
162
+ // process 2 vals per j iter
163
+
164
+ // dequantize
165
+ // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
166
+ dfloat2 v;
167
+ dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
168
+
169
+ // matrix multiplication
170
+ // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
171
+ #ifdef GGML_SYCL_F16
172
+ dfloat2 t1{y[iybs + iqs + j / qr + 0],
173
+ y[iybs + iqs + j / qr + y_offset]};
174
+
175
+ tmp += v * t1;
176
+ #else
177
+ tmp += v.x() * y[iybs + iqs + j / qr + 0];
178
+ tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
179
+ #endif // GGML_SYCL_F16
180
+ }
181
+ }
182
+
183
+ // sum up partial sums and write back result
184
+ const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2;
185
+ for (int mask = mask_start; mask > 0; mask >>= 1) {
186
+ tmp +=
187
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
188
+ }
189
+
190
+ if (tid == 0) {
191
+ #ifdef GGML_SYCL_F16
192
+ dst[row] = tmp.x() + tmp.y();
193
+ #else
194
+ dst[row] = tmp;
195
+ #endif // GGML_SYCL_F16
196
+ }
197
+ }
198
+
199
  static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
200
  float *dst, const int ncols,
201
  const int nrows,
 
864
  }
865
  }
866
 
867
+ static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloat *y,
868
+ float *dst, const int ncols,
869
+ const int nrows,
870
+ dpct::queue_ptr stream) {
871
+ GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
872
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
873
+ // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
874
+ const sycl::range<3> block_nums(1, 1, block_num_y);
875
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
876
+ {
877
+ dpct::has_capability_or_fail(stream->get_device(),
878
+ {sycl::aspect::fp16});
879
+
880
+ stream->parallel_for(
881
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
882
+ [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
883
+ dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(
884
+ vx, y, dst, ncols, nrows, item_ct1);
885
+ });
886
+ }
887
+ }
888
+
889
 
890
  static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
891
  float *dst, const int ncols,
 
1080
 
1081
  const int64_t ne00 = src0->ne[0];
1082
  const int64_t row_diff = row_high - row_low;
 
1083
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
1084
  // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
1085
  #ifdef GGML_SYCL_F16
 
1093
 
1094
  if (src1_convert_f16) {
1095
  src1_dfloat = src1_dfloat_a.alloc(ne00);
1096
+ const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
1097
  GGML_ASSERT(to_fp16_sycl != nullptr);
1098
  to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream);
1099
  }
 
1103
 
1104
  switch (src0->type) {
1105
  case GGML_TYPE_Q4_0:
1106
+ if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
1107
+ ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
1108
+ dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1109
+ } else {
1110
+ dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1111
+ }
1112
  break;
1113
  case GGML_TYPE_Q4_1:
1114
  dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
 
1151
  GGML_UNUSED(src1_ddq_i);
1152
  GGML_UNUSED(src1_ncols);
1153
  GGML_UNUSED(src1_padded_row_size);
1154
+ GGML_UNUSED(ctx);
1155
  }
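As a companion to the layout sketch after the commit message, the snippet below dequantizes one block of a reordered Q4_0 buffer back to 32 floats, mirroring the indexing used by dequantize_block_q4_0_reorder and dequantize_mul_mat_vec_reorder above; the function names and the fp16 helper are illustrative stand-ins, not backend code.

#include <cmath>
#include <cstddef>
#include <cstdint>

constexpr int QK4_0 = 32;   // quants per block, as in ggml

// Crude IEEE binary16 -> float conversion for the scale (stand-in for sycl::half).
static float half_bits_to_float(uint16_t h) {
    const int sign = (h >> 15) & 1, exp = (h >> 10) & 0x1F, man = h & 0x3FF;
    float v;
    if (exp == 0)       v = std::ldexp(man / 1024.0f, -14);          // subnormal
    else if (exp == 31) v = man ? NAN : INFINITY;                    // NaN / inf
    else                v = std::ldexp(1.0f + man / 1024.0f, exp - 15);
    return sign ? -v : v;
}

// Dequantize block `ib` of a reordered buffer laid out as
//   [ nblocks * 16 bytes of quants | nblocks fp16 scales ].
void dequantize_reordered_q4_0(const uint8_t * buf, size_t nblocks, size_t ib, float out[QK4_0]) {
    const uint8_t  * qs     = buf + ib * (QK4_0 / 2);
    const uint16_t * d_bits = reinterpret_cast<const uint16_t *>(buf + nblocks * (QK4_0 / 2)) + ib;
    const float d = half_bits_to_float(*d_bits);
    for (int l = 0; l < QK4_0 / 2; ++l) {
        const int vq = qs[l];
        out[l +  0] = d * ((vq & 0xF) - 8);   // low nibbles fill the first half of the block
        out[l + 16] = d * ((vq >> 4)  - 8);   // high nibbles fill the second half
    }
}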
ggml/src/ggml-sycl/getrows.cpp ADDED
@@ -0,0 +1,308 @@
1
+ //
2
+ // MIT license
3
+ // Copyright (C) 2024 Intel Corporation
4
+ // SPDX-License-Identifier: MIT
5
+ //
6
+
7
+ //
8
+ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
9
+ // See https://llvm.org/LICENSE.txt for license information.
10
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11
+ //
12
+
13
+ #include "ggml-impl.h"
14
+ #include "common.hpp"
15
+ #include "dequantize.hpp"
16
+ #include "getrows.hpp"
17
+
18
+
19
+ template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
20
+ static void k_get_rows(
21
+ const void * src0, const int32_t * src1, dst_t * dst,
22
+ int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
23
+ /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
24
+ /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
25
+ /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
26
+ size_t s10, size_t s11, size_t s12,
27
+ const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
28
+
29
+ const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
30
+ item_ct1.get_local_id(2)) *
31
+ 2;
32
+ const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
33
+ item_ct1.get_local_id(1);
34
+ const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
35
+ item_ct1.get_local_id(0)) /
36
+ ne12;
37
+ const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
38
+ item_ct1.get_local_id(0)) %
39
+ ne12;
40
+
41
+ if (i00 >= ne00) {
42
+ return;
43
+ }
44
+
45
+ const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
46
+
47
+ dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
48
+ const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
49
+
50
+ const int ib = i00/qk; // block index
51
+ const int iqs = (i00%qk)/qr; // quant index
52
+ const int iybs = i00 - i00%qk; // dst block start index
53
+ const int y_offset = qr == 1 ? 1 : qk/2;
54
+
55
+ // dequantize
56
+ dfloat2 v;
57
+ dequantize_kernel(src0_row, ib, iqs, v);
58
+
59
+ dst_row[iybs + iqs + 0] = v.x();
60
+ dst_row[iybs + iqs + y_offset] = v.y();
61
+ }
62
+
63
+ template<int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_recorder, typename dst_t>
64
+ static void k_get_rows_reorder(
65
+ const void * src0, const void *src0_dq, const int32_t * src1, dst_t * dst,
66
+ int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
67
+ /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
68
+ /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
69
+ /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
70
+ size_t s10, size_t s11, size_t s12,
71
+ const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
72
+
73
+ const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
74
+ item_ct1.get_local_id(2)) *
75
+ 2;
76
+ const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
77
+ item_ct1.get_local_id(1);
78
+ const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
79
+ item_ct1.get_local_id(0)) /
80
+ ne12;
81
+ const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
82
+ item_ct1.get_local_id(0)) %
83
+ ne12;
84
+
85
+ if (i00 >= ne00) {
86
+ return;
87
+ }
88
+ auto ncols = ne00;
89
+ const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
90
+
91
+ dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
92
+
93
+ const int src0_off = i01 * ncols + i00;
94
+ const int ib = src0_off / QK4_0; // block index
95
+ const int iqs = (i00%qk)/qr; // x quant index
96
+ const int iybs = i00 - i00%qk; // dst block start index
97
+ const int y_offset = qr == 1 ? 1 : qk/2;
98
+
99
+ // dequantize
100
+ dfloat2 v;
101
+ dequantize_kernel_recorder((const void *)src0_dq, ib, (const void *)src0, src0_off/2, v);
102
+
103
+ dst_row[iybs + iqs + 0] = v.x();
104
+ dst_row[iybs + iqs + y_offset] = v.y();
105
+
106
+ GGML_UNUSED(nb01);
107
+ GGML_UNUSED(nb02);
108
+ GGML_UNUSED(nb03);
109
+ }
110
+
111
+ template<typename src0_t, typename dst_t>
112
+ static void k_get_rows_float(
113
+ const src0_t * src0, const int32_t * src1, dst_t * dst,
114
+ int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
115
+ /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
116
+ /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
117
+ /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
118
+ size_t s10, size_t s11, size_t s12,
119
+ const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
120
+
121
+ const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
122
+ item_ct1.get_local_id(2);
123
+ const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
124
+ item_ct1.get_local_id(1);
125
+ const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
126
+ item_ct1.get_local_id(0)) /
127
+ ne12;
128
+ const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
129
+ item_ct1.get_local_id(0)) %
130
+ ne12;
131
+
132
+ if (i00 >= ne00) {
133
+ return;
134
+ }
135
+
136
+ const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
137
+
138
+ dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
139
+ const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
140
+
141
+ dst_row[i00] = src0_row[i00];
142
+ }
143
+
144
+ template <int qk, int qr, dequantize_kernel_t dq>
145
+ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
146
+ ggml_tensor *dst, const void *src0_dd,
147
+ const int32_t *src1_dd, float *dst_dd,
148
+ queue_ptr stream) {
149
+
150
+ GGML_TENSOR_BINARY_OP_LOCALS
151
+
152
+ const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
153
+ const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE);
154
+ const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
155
+
156
+ // strides in elements
157
+ //const size_t s0 = nb0 / ggml_element_size(dst);
158
+ const size_t s1 = nb1 / ggml_element_size(dst);
159
+ const size_t s2 = nb2 / ggml_element_size(dst);
160
+ const size_t s3 = nb3 / ggml_element_size(dst);
161
+
162
+ const size_t s10 = nb10 / ggml_element_size(src1);
163
+ const size_t s11 = nb11 / ggml_element_size(src1);
164
+ const size_t s12 = nb12 / ggml_element_size(src1);
165
+ //const size_t s13 = nb13 / ggml_element_size(src1);
166
+
167
+ GGML_ASSERT(ne00 % 2 == 0);
168
+
169
+ stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
170
+ [=](sycl::nd_item<3> item_ct1) {
171
+ k_get_rows<qk, qr, dq>(
172
+ src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
173
+ s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
174
+ });
175
+
176
+ GGML_UNUSED(dst);
177
+ GGML_UNUSED(ctx);
178
+ }
179
+
180
+ template <int qk, int qr, dequantize_kernel_t_reorder dq_reorder>
181
+ static void get_rows_sycl_reorder(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
182
+ ggml_tensor *dst, const void *src0_dd,
183
+ const int32_t *src1_dd, float *dst_dd,
184
+ queue_ptr stream) {
185
+
186
+ GGML_TENSOR_BINARY_OP_LOCALS
187
+
188
+ const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
189
+ const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE);
190
+ const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
191
+
192
+ // strides in elements
193
+ //const size_t s0 = nb0 / ggml_element_size(dst);
194
+ const size_t s1 = nb1 / ggml_element_size(dst);
195
+ const size_t s2 = nb2 / ggml_element_size(dst);
196
+ const size_t s3 = nb3 / ggml_element_size(dst);
197
+
198
+ const size_t s10 = nb10 / ggml_element_size(src1);
199
+ const size_t s11 = nb11 / ggml_element_size(src1);
200
+ const size_t s12 = nb12 / ggml_element_size(src1);
201
+ //const size_t s13 = nb13 / ggml_element_size(src1);
202
+
203
+ GGML_ASSERT(ne00 % 2 == 0);
204
+
205
+ const uint8_t* src0_q = (const uint8_t*)src0_dd;
206
+ const size_t ncols = ne00;
207
+ const size_t nrows = ne01;
208
+ const sycl::half* src0_dq = (const sycl::half*)(src0_q + nrows * ncols / 2);
209
+ stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
210
+ [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]]{
211
+ k_get_rows_reorder<qk, qr, dq_reorder>(
212
+ src0_dd, src0_dq, src1_dd, dst_dd, ne00, ne12, s1, s2,
213
+ s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
214
+ });
215
+
216
+ GGML_UNUSED(dst);
217
+ GGML_UNUSED(ctx);
218
+ }
219
+
220
+
221
+ template <typename src0_t>
222
+ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
223
+ const ggml_tensor *src1, ggml_tensor *dst,
224
+ const src0_t *src0_dd, const int32_t *src1_dd,
225
+ float *dst_dd, queue_ptr stream) {
226
+
227
+ GGML_TENSOR_BINARY_OP_LOCALS
228
+
229
+ const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
230
+ const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE;
231
+ const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
232
+
233
+ // strides in elements
234
+ //const size_t s0 = nb0 / ggml_element_size(dst);
235
+ const size_t s1 = nb1 / ggml_element_size(dst);
236
+ const size_t s2 = nb2 / ggml_element_size(dst);
237
+ const size_t s3 = nb3 / ggml_element_size(dst);
238
+
239
+ const size_t s10 = nb10 / ggml_element_size(src1);
240
+ const size_t s11 = nb11 / ggml_element_size(src1);
241
+ const size_t s12 = nb12 / ggml_element_size(src1);
242
+ //const size_t s13 = nb13 / ggml_element_size(src1);
243
+
244
+ {
245
+ dpct::has_capability_or_fail(stream->get_device(),
246
+ {sycl::aspect::fp16});
247
+
248
+ stream->parallel_for(
249
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
250
+ [=](sycl::nd_item<3> item_ct1) {
251
+ k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
252
+ s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
253
+ });
254
+ }
255
+
256
+ GGML_UNUSED(dst);
257
+ GGML_UNUSED(ctx);
258
+ }
259
+
260
+ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
261
+ const ggml_tensor *src1, ggml_tensor *dst,
262
+ const float *src0_d, const float *src1_d,
263
+ float *dst_d, const queue_ptr &stream) {
264
+
265
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
266
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
267
+
268
+ GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
269
+ GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
270
+ GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
271
+
272
+ const int32_t * src1_i32 = (const int32_t *) src1_d;
273
+
274
+ switch (src0->type) {
275
+ case GGML_TYPE_F16:
276
+ get_rows_sycl_float(ctx, src0, src1, dst, (const sycl::half *)src0_d,
277
+ src1_i32, dst_d, stream);
278
+ break;
279
+ case GGML_TYPE_F32:
280
+ get_rows_sycl_float(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
281
+ break;
282
+ case GGML_TYPE_Q4_0:
283
+ if (ctx.opt_feature.reorder && dst->op == GGML_OP_MUL_MAT) {
284
+ get_rows_sycl_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
285
+ } else {
286
+ get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
287
+ }
288
+ break;
289
+ case GGML_TYPE_Q4_1:
290
+ get_rows_sycl<QK4_1, QR4_1, dequantize_q4_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
291
+ break;
292
+ case GGML_TYPE_Q5_0:
293
+ get_rows_sycl<QK5_0, QR5_0, dequantize_q5_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
294
+ break;
295
+ case GGML_TYPE_Q5_1:
296
+ get_rows_sycl<QK5_1, QR5_1, dequantize_q5_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
297
+ break;
298
+ case GGML_TYPE_Q8_0:
299
+ get_rows_sycl<QK8_0, QR8_0, dequantize_q8_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
300
+ break;
301
+ default:
302
+ // TODO: k-quants
303
+ GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
304
+ GGML_ABORT("fatal error");
305
+ break;
306
+ }
307
+ }
308
+
ggml/src/ggml-sycl/getrows.hpp ADDED
@@ -0,0 +1,23 @@
1
+ //
2
+ // MIT license
3
+ // Copyright (C) 2024 Intel Corporation
4
+ // SPDX-License-Identifier: MIT
5
+ //
6
+
7
+ //
8
+ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
9
+ // See https://llvm.org/LICENSE.txt for license information.
10
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11
+ //
12
+
13
+ #ifndef GGML_SYCL_GETROWS_HPP
14
+ #define GGML_SYCL_GETROWS_HPP
15
+
16
+ #include "common.hpp"
17
+
18
+ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
19
+ const ggml_tensor *src1, ggml_tensor *dst,
20
+ const float *src0_d, const float *src1_d,
21
+ float *dst_d, const queue_ptr &stream);
22
+
23
+ #endif // GGML_SYCL_GETROWS_HPP
ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -39,9 +39,12 @@
39
  #include "ggml-sycl/backend.hpp"
40
  #include "ggml-sycl/presets.hpp"
41
  #include "ggml-sycl/gemm.hpp"
 
 
42
 
43
  static bool g_sycl_loaded = false;
44
  int g_ggml_sycl_debug = 0;
 
45
 
46
  static ggml_sycl_device_info ggml_sycl_init() {
47
  ggml_sycl_device_info info = {};
@@ -64,14 +67,18 @@ static ggml_sycl_device_info ggml_sycl_init() {
64
  for (int i = 0; i < info.device_count; ++i) {
65
  info.devices[i].vmm = 0;
66
  dpct::device_info prop;
 
 
67
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
68
- prop, dpct::dev_mgr::instance().get_device(i))));
69
 
70
  info.default_tensor_split[i] = total_vram;
71
  total_vram += prop.get_global_mem_size();
72
 
73
  info.devices[i].cc =
74
  100 * prop.get_major_version() + 10 * prop.get_minor_version();
 
 
75
 
76
  info.max_work_group_sizes[i] = prop.get_max_work_group_size();
77
  }
@@ -110,6 +117,27 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
110
  global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
111
  }
112
 
 
 
113
  void ggml_backend_sycl_print_sycl_devices() {
114
  GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
115
  int device_count = dpct::dev_mgr::instance().device_count();
@@ -138,6 +166,8 @@ void ggml_backend_sycl_print_sycl_devices() {
138
  << "]";
139
  print_device_detail(id, device, device_type.str());
140
  }
 
 
141
  }
142
 
143
  static inline int get_sycl_env(const char *env_name, int default_val) {
@@ -159,17 +189,21 @@ static void ggml_check_sycl() try {
159
 
160
  if (!initialized) {
161
  g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
 
162
  GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
163
- GGML_LOG_INFO("GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
 
 
 
164
  #if defined(GGML_SYCL_FORCE_MMQ)
165
- GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ: yes\n");
166
  #else
167
- GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ: no\n");
168
  #endif
169
  #if defined(GGML_SYCL_F16)
170
- GGML_LOG_INFO("GGML_SYCL_F16: yes\n");
171
  #else
172
- GGML_LOG_INFO("GGML_SYCL_F16: no\n");
173
  #endif
174
 
175
  /* NOT REMOVE, keep it for next optimize for XMX.
@@ -241,19 +275,27 @@ struct ggml_backend_sycl_buffer_context {
241
  void * dev_ptr = nullptr;
242
  queue_ptr stream;
243
  std::string name;
 
 
244
 
245
- ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
246
  device(device), dev_ptr(dev_ptr), stream(stream) {
247
  check_allow_gpu_index(device);
248
  name = (GGML_SYCL_NAME + std::to_string(device));
 
249
  }
250
 
251
-
252
  ~ggml_backend_sycl_buffer_context() {
253
  if (dev_ptr != nullptr) {
254
  ggml_sycl_set_device(device);
255
  SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
256
  }
 
 
 
 
 
 
257
  }
258
  };
259
 
@@ -291,6 +333,9 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
291
  return;
292
  }
293
 
 
 
 
294
 
295
  if (ggml_is_quantized(tensor->type)) {
296
  // initialize padding to 0 to avoid possible NaN values
@@ -316,7 +361,6 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
316
  size_t size) try {
317
 
318
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
319
-
320
  ggml_sycl_set_device(ctx->device);
321
  auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
322
  SYCL_CHECK(
@@ -660,32 +704,7 @@ struct ggml_backend_sycl_split_buffer_type_context {
660
  struct ggml_backend_sycl_split_buffer_context {
661
  ~ggml_backend_sycl_split_buffer_context() try {
662
  for (ggml_tensor_extra_gpu * extra : tensor_extras) {
663
- for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
664
- for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
665
- if (extra->events[i][is] != nullptr) {
666
- /*
667
- DPCT1009:206: SYCL uses exceptions to report errors and
668
- does not use the error codes. The original code was
669
- commented out and a warning string was inserted. You
670
- need to rewrite this code.
671
- */
672
- SYCL_CHECK(CHECK_TRY_ERROR(
673
- dpct::destroy_event(extra->events[i][is])));
674
- }
675
- }
676
- if (extra->data_device[i] != nullptr) {
677
- /*
678
- DPCT1009:207: SYCL uses exceptions to report errors and does
679
- not use the error codes. The original code was commented out
680
- and a warning string was inserted. You need to rewrite this
681
- code.
682
- */
683
- ggml_sycl_set_device(i);
684
- SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(
685
- extra->data_device[i], *(streams[i]))));
686
- }
687
- }
688
- delete extra;
689
  }
690
  }
691
  catch (sycl::exception const &exc) {
@@ -723,7 +742,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
723
  ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
724
 
725
  ctx->tensor_extras.push_back(extra);
726
- ctx->streams.push_back(&(dpct::get_current_device().default_queue()));
727
 
728
  for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
729
  int64_t row_low, row_high;
@@ -1337,83 +1356,6 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
1337
  reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
1338
  }
1339
 
1340
- template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1341
- static void k_get_rows(
1342
- const void * src0, const int32_t * src1, dst_t * dst,
1343
- int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
1344
- /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
1345
- /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
1346
- /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
1347
- size_t s10, size_t s11, size_t s12,
1348
- const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
1349
-
1350
- const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
1351
- item_ct1.get_local_id(2)) *
1352
- 2;
1353
- const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
1354
- item_ct1.get_local_id(1);
1355
- const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
1356
- item_ct1.get_local_id(0)) /
1357
- ne12;
1358
- const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
1359
- item_ct1.get_local_id(0)) %
1360
- ne12;
1361
-
1362
- if (i00 >= ne00) {
1363
- return;
1364
- }
1365
-
1366
- const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
1367
-
1368
- dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
1369
- const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
1370
-
1371
- const int ib = i00/qk; // block index
1372
- const int iqs = (i00%qk)/qr; // quant index
1373
- const int iybs = i00 - i00%qk; // dst block start index
1374
- const int y_offset = qr == 1 ? 1 : qk/2;
1375
-
1376
- // dequantize
1377
- dfloat2 v;
1378
- dequantize_kernel(src0_row, ib, iqs, v);
1379
-
1380
- dst_row[iybs + iqs + 0] = v.x();
1381
- dst_row[iybs + iqs + y_offset] = v.y();
1382
- }
1383
-
1384
- template<typename src0_t, typename dst_t>
1385
- static void k_get_rows_float(
1386
- const src0_t * src0, const int32_t * src1, dst_t * dst,
1387
- int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
1388
- /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
1389
- /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
1390
- /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
1391
- size_t s10, size_t s11, size_t s12,
1392
- const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
1393
-
1394
- const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
1395
- item_ct1.get_local_id(2);
1396
- const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
1397
- item_ct1.get_local_id(1);
1398
- const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
1399
- item_ct1.get_local_id(0)) /
1400
- ne12;
1401
- const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
1402
- item_ct1.get_local_id(0)) %
1403
- ne12;
1404
-
1405
- if (i00 >= ne00) {
1406
- return;
1407
- }
1408
-
1409
- const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
1410
-
1411
- dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
1412
- const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
1413
-
1414
- dst_row[i00] = src0_row[i00];
1415
- }
1416
-
1417
  static void mul_mat_p021_f16_f32(
1418
  const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
1419
  const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
@@ -1896,81 +1838,6 @@ static void pool2d_nchw_kernel(
1896
  o_ptr[cur_oh * ow + cur_ow] = res;
1897
  }
1898
 
1899
- template <int qk, int qr, dequantize_kernel_t dq>
1900
- static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
1901
- ggml_tensor *dst, const void *src0_dd,
1902
- const int32_t *src1_dd, float *dst_dd,
1903
- queue_ptr stream) {
1904
-
1905
- GGML_TENSOR_BINARY_OP_LOCALS
1906
-
1907
- const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
1908
- const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE);
1909
- const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
1910
-
1911
- // strides in elements
1912
- //const size_t s0 = nb0 / ggml_element_size(dst);
1913
- const size_t s1 = nb1 / ggml_element_size(dst);
1914
- const size_t s2 = nb2 / ggml_element_size(dst);
1915
- const size_t s3 = nb3 / ggml_element_size(dst);
1916
-
1917
- const size_t s10 = nb10 / ggml_element_size(src1);
1918
- const size_t s11 = nb11 / ggml_element_size(src1);
1919
- const size_t s12 = nb12 / ggml_element_size(src1);
1920
- //const size_t s13 = nb13 / ggml_element_size(src1);
1921
-
1922
- GGML_ASSERT(ne00 % 2 == 0);
1923
-
1924
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
1925
- [=](sycl::nd_item<3> item_ct1) {
1926
- k_get_rows<qk, qr, dq>(
1927
- src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
1928
- s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
1929
- });
1930
-
1931
- GGML_UNUSED(dst);
1932
- GGML_UNUSED(ctx);
1933
- }
1934
-
1935
- template <typename src0_t>
1936
- static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
1937
- const ggml_tensor *src1, ggml_tensor *dst,
1938
- const src0_t *src0_dd, const int32_t *src1_dd,
1939
- float *dst_dd, queue_ptr stream) {
1940
-
1941
- GGML_TENSOR_BINARY_OP_LOCALS
1942
-
1943
- const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
1944
- const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE;
1945
- const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
1946
-
1947
- // strides in elements
1948
- //const size_t s0 = nb0 / ggml_element_size(dst);
1949
- const size_t s1 = nb1 / ggml_element_size(dst);
1950
- const size_t s2 = nb2 / ggml_element_size(dst);
1951
- const size_t s3 = nb3 / ggml_element_size(dst);
1952
-
1953
- const size_t s10 = nb10 / ggml_element_size(src1);
1954
- const size_t s11 = nb11 / ggml_element_size(src1);
1955
- const size_t s12 = nb12 / ggml_element_size(src1);
1956
- //const size_t s13 = nb13 / ggml_element_size(src1);
1957
-
1958
- {
1959
- dpct::has_capability_or_fail(stream->get_device(),
1960
- {sycl::aspect::fp16});
1961
-
1962
- stream->parallel_for(
1963
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1964
- [=](sycl::nd_item<3> item_ct1) {
1965
- k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
1966
- s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
1967
- });
1968
- }
1969
-
1970
- GGML_UNUSED(dst);
1971
- GGML_UNUSED(ctx);
1972
- }
1973
-
1974
  static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
1975
  const int ky, const int kx_padded,
1976
  queue_ptr stream) {
@@ -2494,52 +2361,6 @@ catch (sycl::exception const &exc) {
2494
  std::exit(1);
2495
  }
2496
 
2497
- static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
2498
- const ggml_tensor *src1, ggml_tensor *dst,
2499
- const float *src0_d, const float *src1_d,
2500
- float *dst_d, const queue_ptr &stream) {
2501
-
2502
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
2503
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
2504
-
2505
- GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
2506
- GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
2507
- GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
2508
-
2509
- const int32_t * src1_i32 = (const int32_t *) src1_d;
2510
-
2511
- switch (src0->type) {
2512
- case GGML_TYPE_F16:
2513
- get_rows_sycl_float(ctx, src0, src1, dst, (const sycl::half *)src0_d,
2514
- src1_i32, dst_d, stream);
2515
- break;
2516
- case GGML_TYPE_F32:
2517
- get_rows_sycl_float(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
2518
- break;
2519
- case GGML_TYPE_Q4_0:
2520
- get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
2521
- break;
2522
- case GGML_TYPE_Q4_1:
2523
- get_rows_sycl<QK4_1, QR4_1, dequantize_q4_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
2524
- break;
2525
- case GGML_TYPE_Q5_0:
2526
- get_rows_sycl<QK5_0, QR5_0, dequantize_q5_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
2527
- break;
2528
- case GGML_TYPE_Q5_1:
2529
- get_rows_sycl<QK5_1, QR5_1, dequantize_q5_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
2530
- break;
2531
- case GGML_TYPE_Q8_0:
2532
- get_rows_sycl<QK8_0, QR8_0, dequantize_q8_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
2533
- break;
2534
- default:
2535
- // TODO: k-quants
2536
- GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
2537
- GGML_ABORT("fatal error");
2538
- break;
2539
- }
2540
- }
2541
-
2542
-
2543
  static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
2544
  const ggml_tensor *src1, ggml_tensor *dst,
2545
  const float *src0_d, const float *src1_d,
@@ -2589,11 +2410,10 @@ inline void ggml_sycl_op_mul_mat_sycl(
2589
  if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
2590
  use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
2591
  dst->op_params[0] == GGML_PREC_DEFAULT) {
2592
-
2593
  // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
2594
  ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
2595
  if (src0->type != GGML_TYPE_F16) {
2596
- const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type);
2597
  GGML_ASSERT(to_fp16_sycl != nullptr);
2598
  size_t ne = row_diff*ne00;
2599
  src0_as_f16.alloc(ne);
@@ -2605,7 +2425,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
2605
 
2606
  ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
2607
  if (src1->type != GGML_TYPE_F16) {
2608
- const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
2609
  GGML_ASSERT(to_fp16_sycl != nullptr);
2610
  size_t ne = src1_ncols*ne10;
2611
  src1_as_f16.alloc(ne);
@@ -2626,13 +2446,13 @@ inline void ggml_sycl_op_mul_mat_sycl(
2626
  src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
2627
  dst_f16.get(), dpct::library_data_t::real_half, ldc,
2628
  dpct::library_data_t::real_half)));
2629
- const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
2630
  to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
2631
  #else
2632
  auto dnnl_stream = ctx.stream_dnnl(stream);
2633
  DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
2634
  src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
2635
- const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
2636
  to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
2637
  #endif
2638
  }
@@ -2641,13 +2461,13 @@ inline void ggml_sycl_op_mul_mat_sycl(
2641
  ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
2642
  ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
2643
  if (src0->type != GGML_TYPE_F32) {
2644
- const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type);
2645
  GGML_ASSERT(to_fp32_sycl != nullptr);
2646
  src0_ddq_as_f32.alloc(row_diff*ne00);
2647
  to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
2648
  }
2649
  if (src1->type != GGML_TYPE_F32) {
2650
- const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type);
2651
  GGML_ASSERT(to_fp32_sycl != nullptr);
2652
  src1_ddq_as_f32.alloc(src1_ncols*ne10);
2653
  to_fp32_sycl(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
@@ -3085,7 +2905,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
3085
  for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
3086
  const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0;
3087
  const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
3088
-
3089
  for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
3090
  if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
3091
  continue;
@@ -3393,7 +3212,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
3393
  // convert src1 to fp16
3394
  ggml_sycl_pool_alloc<sycl::half> src1_f16_alloc(ctx.pool());
3395
  if (src1->type != GGML_TYPE_F16) {
3396
- const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
3397
  const int64_t ne_src1 = ggml_nelements(src1);
3398
  src1_f16_alloc.alloc(ne_src1);
3399
  GGML_ASSERT(to_fp16_sycl != nullptr);
@@ -3509,6 +3328,7 @@ bool ggml_sycl_supports_dmmv(enum ggml_type type) {
3509
  }
3510
 
3511
  static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
3512
  const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
3513
  int64_t min_compute_capability = INT_MAX;
3514
 
@@ -3570,6 +3390,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
3570
  ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
3571
  } else if (use_dequantize_mul_mat_vec) {
3572
  ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
 
3573
  } else if (use_mul_mat_vec_q) {
3574
  ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
3575
  } else if (use_mul_mat_q) {
@@ -4251,10 +4072,72 @@ catch (sycl::exception const &exc) {
4251
  std::exit(1);
4252
  }
4253
 
4254
  static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
4255
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
4256
  ggml_sycl_set_main_device(sycl_ctx->device);
4257
 
 
4258
 
4259
  for (int i = 0; i < cgraph->n_nodes; i++) {
4260
  ggml_tensor * node = cgraph->nodes[i];
 
39
  #include "ggml-sycl/backend.hpp"
40
  #include "ggml-sycl/presets.hpp"
41
  #include "ggml-sycl/gemm.hpp"
42
+ #include "ggml-sycl/sycl_hw.hpp"
43
+ #include "ggml-sycl/getrows.hpp"
44
 
45
  static bool g_sycl_loaded = false;
46
  int g_ggml_sycl_debug = 0;
47
+ int g_ggml_sycl_disable_optimize = 0;
48
 
49
  static ggml_sycl_device_info ggml_sycl_init() {
50
  ggml_sycl_device_info info = {};
 
67
  for (int i = 0; i < info.device_count; ++i) {
68
  info.devices[i].vmm = 0;
69
  dpct::device_info prop;
70
+ sycl::device device = dpct::dev_mgr::instance().get_device(i);
71
+
72
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
73
+ prop, device)));
74
 
75
  info.default_tensor_split[i] = total_vram;
76
  total_vram += prop.get_global_mem_size();
77
 
78
  info.devices[i].cc =
79
  100 * prop.get_major_version() + 10 * prop.get_minor_version();
80
+ info.devices[i].hw_info = get_device_hw_info(&device);
81
+ info.devices[i].opt_feature = check_gpu_optimize_feature(info.devices[i].hw_info.arch);
82
 
83
  info.max_work_group_sizes[i] = prop.get_max_work_group_size();
84
  }
 
117
  global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
118
  }
119
 
120
+ void print_device_opt_feature(int device_count) {
121
+ GGML_LOG_INFO("SYCL Optimization Feature:\n");
122
+ GGML_LOG_INFO(
123
+ "|ID| Device Type|Reorder|\n");
124
+ GGML_LOG_INFO(
125
+ "|--|-------------------|-------|\n");
126
+ std::map<std::string, size_t> DeviceNums;
127
+ for (int id = 0; id < device_count; ++id) {
128
+ sycl::device device = dpct::dev_mgr::instance().get_device(id);
129
+ std::string backend_type = get_device_backend_and_type(device);
130
+ int type_id = DeviceNums[backend_type]++;
131
+ std::stringstream device_type;
132
+ device_type << "[" << backend_type << ":" << std::to_string(type_id)
133
+ << "]";
134
+ std::string device_type_s = device_type.str();
135
+ device_type_s = std::regex_replace(device_type_s, std::regex("ext_oneapi_"), "");
136
+ GGML_LOG_INFO("|%2d|%19s|%7s|\n", id, device_type_s.c_str(),
137
+ ggml_sycl_info().devices[id].opt_feature.reorder ? "Y": "N");
138
+ }
139
+
140
+ }
141
  void ggml_backend_sycl_print_sycl_devices() {
142
  GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
143
  int device_count = dpct::dev_mgr::instance().device_count();
 
166
  << "]";
167
  print_device_detail(id, device, device_type.str());
168
  }
169
+
170
+ print_device_opt_feature(device_count);
171
  }
172
 
173
  static inline int get_sycl_env(const char *env_name, int default_val) {
 
189
 
190
  if (!initialized) {
191
  g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
192
+ g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
193
  GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
194
+ GGML_LOG_INFO("Running with Environment Variables:\n");
195
+ GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
196
+ GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
197
+ GGML_LOG_INFO("Build with Macros:\n");
198
  #if defined(GGML_SYCL_FORCE_MMQ)
199
+ GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
200
  #else
201
+ GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: no\n");
202
  #endif
203
  #if defined(GGML_SYCL_F16)
204
+ GGML_LOG_INFO(" GGML_SYCL_F16: yes\n");
205
  #else
206
+ GGML_LOG_INFO(" GGML_SYCL_F16: no\n");
207
  #endif
208
 
209
  /* NOT REMOVE, keep it for next optimize for XMX.
 
275
  void * dev_ptr = nullptr;
276
  queue_ptr stream;
277
  std::string name;
278
+ optimize_feature opt_feature;
279
+ std::vector<ggml_tensor_extra_gpu *> tensor_extras;
280
 
281
+ ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
282
  device(device), dev_ptr(dev_ptr), stream(stream) {
283
  check_allow_gpu_index(device);
284
  name = (GGML_SYCL_NAME + std::to_string(device));
285
+ opt_feature = ggml_sycl_info().devices[device].opt_feature;
286
  }
287
 
 
288
  ~ggml_backend_sycl_buffer_context() {
289
  if (dev_ptr != nullptr) {
290
  ggml_sycl_set_device(device);
291
  SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
292
  }
293
+
294
+ // release the extras used by tensors
295
+ for (ggml_tensor_extra_gpu * extra : tensor_extras) {
296
+ release_extra_gpu(extra);
297
+ }
298
+
299
  }
300
  };
301
 
 
333
  return;
334
  }
335
 
336
+ ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
337
+ tensor->extra = extra;
338
+ ctx->tensor_extras.push_back(extra); // released when the buffer context is destroyed
339
 
340
  if (ggml_is_quantized(tensor->type)) {
341
  // initialize padding to 0 to avoid possible NaN values
 
361
  size_t size) try {
362
 
363
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
 
364
  ggml_sycl_set_device(ctx->device);
365
  auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
366
  SYCL_CHECK(
 
704
  struct ggml_backend_sycl_split_buffer_context {
705
  ~ggml_backend_sycl_split_buffer_context() try {
706
  for (ggml_tensor_extra_gpu * extra : tensor_extras) {
707
+ release_extra_gpu(extra, streams);
708
  }
709
  }
710
  catch (sycl::exception const &exc) {
 
742
  ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
743
 
744
  ctx->tensor_extras.push_back(extra);
745
+ ctx->streams.push_back(&(dpct::get_current_device().default_queue()));
746
 
747
  for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
748
  int64_t row_low, row_high;
 
1356
  reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
1357
  }
1358
 
1359
  static void mul_mat_p021_f16_f32(
1360
  const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
1361
  const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
 
1838
  o_ptr[cur_oh * ow + cur_ow] = res;
1839
  }
1840
 
1841
  static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
1842
  const int ky, const int kx_padded,
1843
  queue_ptr stream) {
 
2361
  std::exit(1);
2362
  }
2363
 
2364
  static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
2365
  const ggml_tensor *src1, ggml_tensor *dst,
2366
  const float *src0_d, const float *src1_d,
 
2410
  if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
2411
  use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
2412
  dst->op_params[0] == GGML_PREC_DEFAULT) {
 
2413
  // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
2414
  ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
2415
  if (src0->type != GGML_TYPE_F16) {
2416
+ const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type, dst);
2417
  GGML_ASSERT(to_fp16_sycl != nullptr);
2418
  size_t ne = row_diff*ne00;
2419
  src0_as_f16.alloc(ne);
 
2425
 
2426
  ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
2427
  if (src1->type != GGML_TYPE_F16) {
2428
+ const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
2429
  GGML_ASSERT(to_fp16_sycl != nullptr);
2430
  size_t ne = src1_ncols*ne10;
2431
  src1_as_f16.alloc(ne);
 
2446
  src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
2447
  dst_f16.get(), dpct::library_data_t::real_half, ldc,
2448
  dpct::library_data_t::real_half)));
2449
+ const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
2450
  to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
2451
  #else
2452
  auto dnnl_stream = ctx.stream_dnnl(stream);
2453
  DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
2454
  src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
2455
+ const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
2456
  to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
2457
  #endif
2458
  }
 
2461
  ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
2462
  ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
2463
  if (src0->type != GGML_TYPE_F32) {
2464
+ const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
2465
  GGML_ASSERT(to_fp32_sycl != nullptr);
2466
  src0_ddq_as_f32.alloc(row_diff*ne00);
2467
  to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
2468
  }
2469
  if (src1->type != GGML_TYPE_F32) {
2470
+ const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type, dst);
2471
  GGML_ASSERT(to_fp32_sycl != nullptr);
2472
  src1_ddq_as_f32.alloc(src1_ncols*ne10);
2473
  to_fp32_sycl(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
 
2905
  for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
2906
  const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0;
2907
  const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
 
2908
  for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
2909
  if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
2910
  continue;
 
3212
  // convert src1 to fp16
3213
  ggml_sycl_pool_alloc<sycl::half> src1_f16_alloc(ctx.pool());
3214
  if (src1->type != GGML_TYPE_F16) {
3215
+ const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
3216
  const int64_t ne_src1 = ggml_nelements(src1);
3217
  src1_f16_alloc.alloc(ne_src1);
3218
  GGML_ASSERT(to_fp16_sycl != nullptr);
 
3328
  }
3329
 
3330
  static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3331
+
3332
  const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
3333
  int64_t min_compute_capability = INT_MAX;
3334
 
 
3390
  ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
3391
  } else if (use_dequantize_mul_mat_vec) {
3392
  ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
3393
+ // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
3394
  } else if (use_mul_mat_vec_q) {
3395
  ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
3396
  } else if (use_mul_mat_q) {
 
4072
  std::exit(1);
4073
  }
4074
 
4075
+ void reorder_qw(char *data_device, const int ncols, const int nrows,
4076
+ size_t size, size_t offset, dpct::queue_ptr stream) {
4077
+ auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
4078
+ SYCL_CHECK(
4079
+ CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
4080
+ .wait()));
4081
+ GGML_ASSERT((size % sizeof(block_q4_0) == 0));
4082
+ GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
4083
+ int offset_blks = offset / sizeof(block_q4_0);
4084
+ auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
4085
+ auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
4086
+
4087
+ stream->parallel_for(
4088
+ size / sizeof(block_q4_0),
4089
+ [=](auto i) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
4090
+ const block_q4_0* x = (const block_q4_0*)tmp_buf;
4091
+ const int ib = i;
4092
+
4093
+ for (int j = 0; j < QK4_0/2; j ++)
4094
+ {
4095
+ *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
4096
+ }
4097
+ *(d_ptr + ib) = x[ib].d;
4098
+ });
4099
+
4100
+ sycl::free(tmp_buf, *stream);
4101
+ }
4102
+
4103
+ void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
4104
+ char * data_device = (char *) src0->data;
4105
+ size_t ncols = src0->ne[0];
4106
+ size_t nrows = src0->ne[1];
4107
+ size_t size = ggml_nbytes(src0);
4108
+
4109
+ reorder_qw(data_device, ncols, nrows, size, 0, stream);
4110
+ }
4111
+
4112
+ void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
4113
+ ggml_tensor *src0 = dst->src[0];
4114
+ ggml_tensor *src1 = dst->src[1];
4115
+
4116
+ if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 &&
4117
+ src1->ne[2]==1 && src1->ne[3]==1) {
4118
+ reorder_qw(src0, stream);
4119
+ ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
4120
+ GGML_ASSERT(extra);
4121
+ extra->optimized_feature.reorder = true; // tells later dequantize steps to use the reordered layout
4122
+ }
4123
+ }
4124
+
4125
+ void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
4126
+ dpct::queue_ptr stream = ctx->stream();
4127
+ if (ctx->optimized_graph) {
4128
+ return;
4129
+ }
4130
+ ctx->optimized_graph = true;
4131
+
4132
+ for (int i = 0; i < cgraph->n_nodes; i++) {
4133
+ if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
4134
+ }
4135
+ }
4136
  static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
4137
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
4138
  ggml_sycl_set_main_device(sycl_ctx->device);
4139
 
4140
+ if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
4141
 
4142
  for (int i = 0; i < cgraph->n_nodes; i++) {
4143
  ggml_tensor * node = cgraph->nodes[i];
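The reorder_qw kernel added above rewrites a run of Q4_0 blocks from the usual array-of-structs layout (one fp16 scale d followed by 16 packed quant bytes per block) into two contiguous regions, all quant bytes first and all scales after them, so the Q4_0 mul_mat path can read quants and scales with contiguous accesses. Below is a minimal host-side sketch of the same permutation, assuming QK4_0 == 32 and an 18-byte block of { fp16 d; uint8_t qs[16] }; the struct and function names are illustrative and this is not the committed device kernel, which performs the copy with a parallel_for on the SYCL queue and uses offset_blks to start mid-buffer for split tensors.

    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct block_q4_0_sketch {   // stand-in for ggml's block_q4_0 (assumed 2-byte fp16 scale + 16 quant bytes)
        uint16_t d;              // fp16 scale, kept as raw bits here
        uint8_t  qs[16];         // 32 x 4-bit quants packed into 16 bytes
    };

    // In-place: [d0 qs0][d1 qs1]... -> [qs0 qs1 ...][d0 d1 ...]
    // Assumes `data` holds nblocks tightly packed blocks and is at least 2-byte aligned.
    static void reorder_q4_0_host(uint8_t * data, size_t nblocks) {
        std::vector<uint8_t> tmp(data, data + nblocks * sizeof(block_q4_0_sketch));
        const auto * src = reinterpret_cast<const block_q4_0_sketch *>(tmp.data());

        uint8_t  * qs_dst = data;                                              // quant region
        uint16_t * d_dst  = reinterpret_cast<uint16_t *>(data + nblocks * 16); // scale region

        for (size_t ib = 0; ib < nblocks; ++ib) {
            std::memcpy(qs_dst + ib * 16, src[ib].qs, 16);
            d_dst[ib] = src[ib].d;
        }
    }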
ggml/src/ggml-sycl/sycl_hw.cpp ADDED
@@ -0,0 +1,13 @@
 
1
+ #include "sycl_hw.hpp"
2
+
3
+
4
+ sycl_hw_info get_device_hw_info(sycl::device *device_ptr) {
5
+ sycl_hw_info res;
6
+ int32_t id = device_ptr->get_info<sycl::ext::intel::info::device::device_id>();
7
+ res.device_id = id;
8
+
9
+ syclex::architecture arch = device_ptr->get_info<syclex::info::device::architecture>();
10
+ res.arch = arch;
11
+
12
+ return res;
13
+ }
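get_device_hw_info() above stores exactly two values per device: the Intel device id and the architecture enum from the sycl_ext_oneapi_device_architecture extension. The same queries can be exercised standalone; the sketch below assumes a oneAPI DPC++ compiler with both extensions available and an Intel GPU (the device_id query is Intel-specific and may throw on other backends). It is not part of the commit.

    #include <sycl/sycl.hpp>
    #include <iostream>

    namespace syclex = sycl::ext::oneapi::experimental;

    int main() {
        // Enumerate GPUs and print the two values that get_device_hw_info() records.
        for (const auto & dev : sycl::device::get_devices(sycl::info::device_type::gpu)) {
            const auto arch = dev.get_info<syclex::info::device::architecture>();
            const auto id   = dev.get_info<sycl::ext::intel::info::device::device_id>();
            std::cout << dev.get_info<sycl::info::device::name>()
                      << "  device_id=0x" << std::hex << id
                      << "  arch=" << std::dec << static_cast<int>(arch) << "\n";
        }
        return 0;
    }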
ggml/src/ggml-sycl/sycl_hw.hpp ADDED
@@ -0,0 +1,23 @@
 
1
+ #ifndef SYCL_HW_HPP
2
+ #define SYCL_HW_HPP
3
+
4
+ #include <algorithm>
5
+ #include <stdio.h>
6
+ #include <vector>
7
+ #include <map>
8
+
9
+ #include <sycl/sycl.hpp>
10
+
11
+ namespace syclex = sycl::ext::oneapi::experimental;
12
+
13
+ struct sycl_hw_info {
14
+ syclex::architecture arch;
15
+ int32_t device_id;
16
+ };
17
+
18
+ bool is_in_vector(std::vector<int> &vec, int item);
19
+
20
+ sycl_hw_info get_device_hw_info(sycl::device *device_ptr);
21
+
22
+
23
+ #endif // SYCL_HW_HPP
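check_gpu_optimize_feature(), called from ggml_sycl_init() to fill devices[i].opt_feature, is not part of the hunks shown here; conceptually it maps the detected syclex::architecture to the feature flags that later gate opt_for_reorder(). A purely hypothetical sketch of that mapping follows; the struct name, helper name, and the architecture list are assumptions, not the committed logic.

    #include <sycl/sycl.hpp>

    namespace syclex = sycl::ext::oneapi::experimental;

    struct optimize_feature_sketch {   // stand-in for the optimize_feature struct in common.hpp
        bool reorder = false;
    };

    // Hypothetical policy: enable the Q4_0 reorder path on a known set of Intel GPU
    // architectures only; the committed function may use different architectures or logic.
    static optimize_feature_sketch check_gpu_optimize_feature_sketch(syclex::architecture arch) {
        optimize_feature_sketch f;
        switch (arch) {
            case syclex::architecture::intel_gpu_dg1:
            case syclex::architecture::intel_gpu_acm_g10:
            case syclex::architecture::intel_gpu_acm_g11:
            case syclex::architecture::intel_gpu_acm_g12:
            case syclex::architecture::intel_gpu_pvc:
                f.reorder = true;
                break;
            default:
                f.reorder = false;
                break;
        }
        return f;
    }

Whatever the detection decides, setting GGML_SYCL_DISABLE_OPT=1 at runtime skips optimize_graph_once() entirely, as wired up in ggml_backend_sycl_graph_compute() above.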