Łukasz Ślusarczyk committed
Commit 77ff985 · 1 Parent(s): 727de7e

fixed compilation warnings in ggml-sycl (llama/12424)

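This commit is three mechanical cleanups applied across the SYCL backend (the warning flags named here are an assumption based on typical DPC++/clang diagnostics; the commit message only says "compilation warnings"):

- Kernel lambdas move from the vendor attribute [[intel::reqd_sub_group_size(N)]] to the SYCL 2020 spelling [[sycl::reqd_sub_group_size(N)]], silencing deprecation warnings about the old namespace.
- Functions used only within their translation unit become static, so no external prototype is expected.
- break statements that can never execute (after GGML_ABORT or after a block that always returns) are deleted.

A minimal sketch of the attribute change in isolation (the kernel and queue setup below are hypothetical, not code from this commit):

    #include <sycl/sycl.hpp>

    constexpr int WARP_SIZE = 32;

    void scale(sycl::queue & q, float * data, size_t n) {
        q.parallel_for(
            sycl::nd_range<1>(sycl::range<1>(n), sycl::range<1>(WARP_SIZE)),
            // SYCL 2020 spelling; [[intel::reqd_sub_group_size(...)]] is the
            // deprecated vendor form this commit replaces throughout.
            [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                data[it.get_global_id(0)] *= 2.0f;
            });
    }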
ggml/src/ggml-sycl/convert.cpp CHANGED
@@ -138,7 +138,7 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int
     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
                                                sycl::range<3>(1, 1, WARP_SIZE),
                                            sycl::range<3>(1, 1, WARP_SIZE)),
-                        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]]{
+                        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
                             dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
                         });
ggml/src/ggml-sycl/dmmv.cpp CHANGED
@@ -210,7 +210,7 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
 
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
            dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
                                                      nrows, item_ct1);
        });
@@ -879,7 +879,7 @@ static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloa
 
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
            dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(
                vx, y, dst, ncols, nrows, item_ct1);
        });
@@ -902,7 +902,7 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
 
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
            dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
                vx, y, dst, ncols, nrows, item_ct1);
        });
@@ -923,7 +923,7 @@ static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
 
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
            dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
                vx, y, dst, ncols, nrows, item_ct1);
        });
@@ -944,7 +944,7 @@ static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y,
 
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
            dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
                vx, y, dst, ncols, nrows, item_ct1);
        });
@@ -965,7 +965,7 @@ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
 
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
            dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
                vx, y, dst, ncols, nrows, item_ct1);
        });
@@ -986,7 +986,7 @@ static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
 
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
            dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
                vx, y, dst, ncols, nrows, item_ct1);
        });
@@ -1004,7 +1004,7 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
     const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
            dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
        });
 }
@@ -1020,7 +1020,7 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
     const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
            dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
        });
 }
@@ -1036,7 +1036,7 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
     const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
            dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
        });
 }
@@ -1049,7 +1049,7 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
     const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
            dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
        });
 }
@@ -1065,7 +1065,7 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
     const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
            dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
        });
 }
@@ -1143,7 +1143,6 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
     default:
         printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
         GGML_ABORT("fatal error");
-        break;
     }
 
     GGML_UNUSED(src1);
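The final hunk above removes a break that could never run: GGML_ABORT expands to a call that does not return, so the statement after it is flagged as unreachable (clang's -Wunreachable-code-break is the usual culprit; naming that flag is an assumption). A condensed illustration with a stand-in for GGML_ABORT:

    #include <cstdio>
    #include <cstdlib>

    // Stand-in for GGML_ABORT; the real macro lives in ggml's headers.
    [[noreturn]] void abort_with(const char * msg) {
        std::fprintf(stderr, "%s\n", msg);
        std::abort();
    }

    int pick(int type) {
        switch (type) {
            case 0:
                return 1;
            default:
                abort_with("fatal error");
                // break;  // dead code: abort_with never returns, so the
                //            compiler warns about the unreachable break
        }
    }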
ggml/src/ggml-sycl/element_wise.cpp CHANGED
@@ -1,7 +1,7 @@
 #include "common.hpp"
 #include "element_wise.hpp"
 
-void acc_f32(const float * x, const float * y, float * dst, const int ne,
+static void acc_f32(const float * x, const float * y, float * dst, const int ne,
              const int ne10, const int ne11, const int ne12,
              const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
@@ -20,7 +20,7 @@ void acc_f32(const float * x, const float * y, float * dst, const int ne,
     }
 }
 
-void gelu_f32(const float * x, float * dst, const int k,
+static void gelu_f32(const float * x, float * dst, const int k,
               const sycl::nd_item<3> &item_ct1) {
     const float GELU_COEF_A = 0.044715f;
     const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
@@ -37,7 +37,7 @@ void gelu_f32(const float * x, float * dst, const int k,
               sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
 }
 
-void silu_f32(const float * x, float * dst, const int k,
+static void silu_f32(const float * x, float * dst, const int k,
               const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -48,7 +48,7 @@ void silu_f32(const float * x, float * dst, const int k,
     dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
 }
 
-void gelu_quick_f32(const float *x, float *dst, int k,
+static void gelu_quick_f32(const float *x, float *dst, int k,
                     const sycl::nd_item<3> &item_ct1) {
     const float GELU_QUICK_COEF = -1.702f;
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
@@ -59,7 +59,7 @@ void gelu_quick_f32(const float *x, float *dst, int k,
     dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
 }
 
-void tanh_f32(const float *x, float *dst, int k,
+static void tanh_f32(const float *x, float *dst, int k,
               const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -69,7 +69,7 @@ void tanh_f32(const float *x, float *dst, int k,
     dst[i] = sycl::tanh((float)(x[i]));
 }
 
-void relu_f32(const float * x, float * dst, const int k,
+static void relu_f32(const float * x, float * dst, const int k,
               const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -80,7 +80,7 @@ void relu_f32(const float * x, float * dst, const int k,
     dst[i] = sycl::fmax((float)(x[i]), (float)0);
 }
 
-void sigmoid_f32(const float * x, float * dst, const int k,
+static void sigmoid_f32(const float * x, float * dst, const int k,
                  const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -91,7 +91,7 @@ void sigmoid_f32(const float * x, float * dst, const int k,
     dst[i] = 1.0f / (1.0f + sycl::native::exp(-x[i]));
 }
 
-void sqrt_f32(const float * x, float * dst, const int k,
+static void sqrt_f32(const float * x, float * dst, const int k,
               const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -102,7 +102,7 @@ void sqrt_f32(const float * x, float * dst, const int k,
     dst[i] = sycl::sqrt(x[i]);
 }
 
-void sin_f32(const float * x, float * dst, const int k,
+static void sin_f32(const float * x, float * dst, const int k,
              const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -113,7 +113,7 @@ void sin_f32(const float * x, float * dst, const int k,
     dst[i] = sycl::sin(x[i]);
 }
 
-void cos_f32(const float * x, float * dst, const int k,
+static void cos_f32(const float * x, float * dst, const int k,
              const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -124,7 +124,7 @@ void cos_f32(const float * x, float * dst, const int k,
     dst[i] = sycl::cos(x[i]);
 }
 
-void hardsigmoid_f32(const float * x, float * dst, const int k,
+static void hardsigmoid_f32(const float * x, float * dst, const int k,
                      const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -135,7 +135,7 @@ void hardsigmoid_f32(const float * x, float * dst, const int k,
     dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
 }
 
-void hardswish_f32(const float * x, float * dst, const int k,
+static void hardswish_f32(const float * x, float * dst, const int k,
                    const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -146,7 +146,7 @@ void hardswish_f32(const float * x, float * dst, const int k,
     dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
 }
 
-void exp_f32(const float * x, float * dst, const int k,
+static void exp_f32(const float * x, float * dst, const int k,
             const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -157,7 +157,7 @@ void exp_f32(const float * x, float * dst, const int k,
     dst[i] = sycl::exp(x[i]);
 }
 
-void log_f32(const float * x, float * dst, const int k,
+static void log_f32(const float * x, float * dst, const int k,
             const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -173,7 +173,7 @@ void log_f32(const float * x, float * dst, const int k,
     }
 }
 
-void neg_f32(const float * x, float * dst, const int k,
+static void neg_f32(const float * x, float * dst, const int k,
             const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -184,7 +184,7 @@ void neg_f32(const float * x, float * dst, const int k,
     dst[i] = -x[i];
 }
 
-void step_f32(const float * x, float * dst, const int k,
+static void step_f32(const float * x, float * dst, const int k,
             const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -195,7 +195,7 @@ void step_f32(const float * x, float * dst, const int k,
     dst[i] = x[i] > 0.0f;
 }
 
-void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
+static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
                     const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -206,7 +206,7 @@ void leaky_relu_f32(const float *x, float *dst, const int k, const float negativ
               sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
 }
 
-void sqr_f32(const float * x, float * dst, const int k,
+static void sqr_f32(const float * x, float * dst, const int k,
             const sycl::nd_item<3> &item_ct1) {
     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                   item_ct1.get_local_id(2);
@@ -217,7 +217,7 @@ void sqr_f32(const float * x, float * dst, const int k,
     dst[i] = x[i] * x[i];
 }
 
-void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
+static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
                  const int nb02, const int nb03, const int ne10, const int ne11,
                  const int ne12, const int ne13, const float sf0, const float sf1,
                  const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
@@ -240,7 +240,7 @@ void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
     dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
 }
 
-void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
+static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
              const sycl::nd_item<3> &item_ct1) {
     int nidx = item_ct1.get_local_id(2) +
                item_ct1.get_group(2) * item_ct1.get_local_range(2);
@@ -262,7 +262,7 @@ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const i
 
 
 
-void acc_f32_sycl(const float *x, const float *y, float *dst,
+static void acc_f32_sycl(const float *x, const float *y, float *dst,
                   const int n_elements, const int ne10, const int ne11,
                   const int ne12, const int nb1, const int nb2,
                   const int offset, queue_ptr stream) {
@@ -277,7 +277,7 @@ void acc_f32_sycl(const float *x, const float *y, float *dst,
         });
 }
 
-void gelu_f32_sycl(const float *x, float *dst, const int k,
+static void gelu_f32_sycl(const float *x, float *dst, const int k,
                    queue_ptr stream) {
     const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
     stream->parallel_for(
@@ -289,7 +289,7 @@ void gelu_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void silu_f32_sycl(const float *x, float *dst, const int k,
+static void silu_f32_sycl(const float *x, float *dst, const int k,
                    queue_ptr stream) {
     const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
     stream->parallel_for(
@@ -301,7 +301,7 @@ void silu_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
+static void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
                          queue_ptr stream) {
     const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
     stream->parallel_for(
@@ -313,7 +313,7 @@ void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void tanh_f32_sycl(const float *x, float *dst, const int k,
+static void tanh_f32_sycl(const float *x, float *dst, const int k,
                    queue_ptr stream) {
     const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
     stream->parallel_for(
@@ -325,7 +325,7 @@ void tanh_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void relu_f32_sycl(const float *x, float *dst, const int k,
+static void relu_f32_sycl(const float *x, float *dst, const int k,
                    queue_ptr stream) {
     const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
     stream->parallel_for(
@@ -337,7 +337,7 @@ void relu_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
+static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
                           queue_ptr stream) {
     const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
     stream->parallel_for(
@@ -349,7 +349,7 @@ void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void hardswish_f32_sycl(const float *x, float *dst, const int k,
+static void hardswish_f32_sycl(const float *x, float *dst, const int k,
                         queue_ptr stream) {
     const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
     stream->parallel_for(
@@ -361,7 +361,7 @@ void hardswish_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void exp_f32_sycl(const float *x, float *dst, const int k,
+static void exp_f32_sycl(const float *x, float *dst, const int k,
                   queue_ptr stream) {
     const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
     stream->parallel_for(
@@ -373,7 +373,7 @@ void exp_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void log_f32_sycl(const float *x, float *dst, const int k,
+static void log_f32_sycl(const float *x, float *dst, const int k,
                   queue_ptr stream) {
     const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
     stream->parallel_for(
@@ -385,7 +385,7 @@ void log_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void neg_f32_sycl(const float *x, float *dst, const int k,
+static void neg_f32_sycl(const float *x, float *dst, const int k,
                   queue_ptr stream) {
     const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
     stream->parallel_for(
@@ -397,7 +397,7 @@ void neg_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void step_f32_sycl(const float *x, float *dst, const int k,
+static void step_f32_sycl(const float *x, float *dst, const int k,
                    queue_ptr stream) {
     const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
     stream->parallel_for(
@@ -409,7 +409,7 @@ void step_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void sigmoid_f32_sycl(const float *x, float *dst, const int k,
+static void sigmoid_f32_sycl(const float *x, float *dst, const int k,
                       queue_ptr stream) {
     const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
     stream->parallel_for(
@@ -421,7 +421,7 @@ void sigmoid_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void sqrt_f32_sycl(const float *x, float *dst, const int k,
+static void sqrt_f32_sycl(const float *x, float *dst, const int k,
                    queue_ptr stream) {
     const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
     stream->parallel_for(
@@ -433,7 +433,7 @@ void sqrt_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void sin_f32_sycl(const float *x, float *dst, const int k,
+static void sin_f32_sycl(const float *x, float *dst, const int k,
                   queue_ptr stream) {
     const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
     stream->parallel_for(
@@ -445,7 +445,7 @@ void sin_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void cos_f32_sycl(const float *x, float *dst, const int k,
+static void cos_f32_sycl(const float *x, float *dst, const int k,
                   queue_ptr stream) {
     const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
     stream->parallel_for(
@@ -457,7 +457,7 @@ void cos_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
+static void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
                          const float negative_slope,
                          queue_ptr stream) {
     const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
@@ -470,7 +470,7 @@ void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void sqr_f32_sycl(const float *x, float *dst, const int k,
+static void sqr_f32_sycl(const float *x, float *dst, const int k,
                   queue_ptr stream) {
     const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
     stream->parallel_for(
@@ -482,7 +482,7 @@ void sqr_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
+static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
                       const int nb02, const int nb03, const int ne10, const int ne11,
                       const int ne12, const int ne13, const float sf0, const float sf1,
                       const float sf2, const float sf3, queue_ptr stream) {
@@ -496,7 +496,7 @@ void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01
         });
 }
 
-void pad_f32_sycl(const float *x, float *dst, const int ne00,
+static void pad_f32_sycl(const float *x, float *dst, const int ne00,
                   const int ne01, const int ne02, const int ne0,
                   const int ne1, const int ne2, queue_ptr stream) {
     int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
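Every hunk in element_wise.cpp is the same one-word fix: kernels and launch helpers that are only referenced inside this translation unit gain internal linkage. With external linkage and no declaration in a header, compilers warn under -Wmissing-prototypes (naming that flag is an assumption; the mechanism is standard C++ linkage). A minimal before/after sketch with a hypothetical helper:

    // Before: external linkage, no prototype in any header -> the compiler
    // suspects a missing declaration and warns.
    // void scale_f32(float * v, int n) { ... }

    // After: internal linkage. The symbol is private to this .cpp file and
    // no prototype is expected, so the warning disappears.
    static void scale_f32(float * v, int n) {
        for (int i = 0; i < n; ++i) {
            v[i] *= 2.0f;
        }
    }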
ggml/src/ggml-sycl/getrows.cpp CHANGED
@@ -207,7 +207,7 @@ static void get_rows_sycl_reorder(ggml_backend_sycl_context & ctx, const ggml_te
     const size_t nrows = ne01;
     const sycl::half* src0_dq = (const sycl::half*)(src0_q + nrows * ncols / 2);
     stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]]{
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
             k_get_rows_reorder<qk, qr, dq_reorder>(
                 src0_dd, src0_dq, src1_dd, dst_dd, ne00, ne12, s1, s2,
                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
@@ -302,7 +302,6 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *s
         // TODO: k-quants
         GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
         GGML_ABORT("fatal error");
-        break;
     }
 }
 
ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -95,7 +95,7 @@ const ggml_sycl_device_info & ggml_sycl_info() {
     return info;
 }
 
-void print_device_detail(int id, sycl::device &device, std::string device_type) {
+static void print_device_detail(int id, sycl::device &device, std::string device_type) {
 
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(
@@ -118,7 +118,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
               global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
 }
 
-void print_device_opt_feature(int device_count) {
+static void print_device_opt_feature(int device_count) {
     GGML_LOG_INFO("SYCL Optimization Feature:\n");
     GGML_LOG_INFO(
         "|ID| Device Type|Reorder|\n");
@@ -401,7 +401,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }
 
-void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
+static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
                     const void *ptr_src, size_t size) {
     char *host_buf = (char *)malloc(size);
     q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
@@ -620,7 +620,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     return &ggml_backend_sycl_buffer_types[device];
 }
 
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
+static ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
 
     int device = ctx->device;
@@ -1682,7 +1682,7 @@ static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
 
     stream->parallel_for(
         sycl::nd_range<3>(num_blocks * block_size, block_size),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
             quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
         });
 }
@@ -1703,7 +1703,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
 
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
             mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
                                  nchannels_y, item_ct1);
         });
@@ -1723,7 +1723,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
 
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
             mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
                                    row_stride_x, channel_stride_x,
                                    nchannels_y / nchannels_x, item_ct1);
@@ -1764,7 +1764,7 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
     const sycl::range<3> block_nums(1, nrows, 1);
     stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                          [=](sycl::nd_item<3> item_ct1)
-                             [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                             [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                  k_sum_rows_f32(x, dst, ncols, item_ct1);
                              });
 }
@@ -2920,7 +2920,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     return false;
 }
 
-bool ggml_sycl_supports_dmmv(enum ggml_type type) {
+static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
@@ -3293,7 +3293,7 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
 }
 
 
-void ggml_sycl_set_main_device(const int main_device) try {
+static void ggml_sycl_set_main_device(const int main_device) try {
     if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
         return;
     }
@@ -3314,7 +3314,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }
 
-bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
+static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
     if (!g_sycl_loaded) return false;
 
     if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) {
@@ -3638,7 +3638,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }
 
-void reorder_qw(char *data_device, const int ncols, const int nrows,
+static void reorder_qw(char *data_device, const int ncols, const int nrows,
                 size_t size, size_t offset, dpct::queue_ptr stream) {
     auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
     SYCL_CHECK(
@@ -3652,7 +3652,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
 
     stream->parallel_for(
         size / sizeof(block_q4_0),
-        [=](auto i) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
            const block_q4_0* x = (const block_q4_0*)tmp_buf;
            const int ib = i;
 
@@ -3666,7 +3666,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
     sycl::free(tmp_buf, *stream);
 }
 
-void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
+static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
     char*data_device = (char*)src0->data;
     size_t ncols = src0->ne[0];
     size_t nrows = src0->ne[1];
@@ -3675,7 +3675,7 @@ void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
     reorder_qw(data_device, ncols, nrows, size, 0, stream);
 }
 
-void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
+static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
     ggml_tensor *src0 = dst->src[0];
     ggml_tensor *src1 = dst->src[1];
 
@@ -3688,7 +3688,7 @@ void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
     }
 }
 
-void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
+static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
     dpct::queue_ptr stream = ctx->stream();
     if (ctx->optimized_graph) {
         return;
@@ -3878,7 +3878,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 return true;
             }
             return false;
-        } break;
+        }
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_NEG:
@@ -3896,7 +3896,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 default:
                     return false;
             }
-            break;
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
         {
@@ -3927,7 +3926,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 return false;
             }
             return true;
-        } break;
+        }
        case GGML_OP_OUT_PROD:
            return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
        case GGML_OP_GET_ROWS:
@@ -3944,7 +3943,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 default:
                     return false;
             }
-        } break;
+        }
        case GGML_OP_CPY:
            {
                ggml_type src0_type = op->src[0]->type;
@@ -3995,12 +3994,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 return true;
             }
             return false;
-        } break;
+        }
         case GGML_OP_CONCAT:
         {
             ggml_type src0_type = op->src[0]->type;
             return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
-        } break;
+        }
         case GGML_OP_DUP:
         case GGML_OP_ARGMAX:
         case GGML_OP_NONE:
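The ggml_backend_sycl_device_supports_op hunks are the unreachable-code cleanup in another shape: a break placed directly after a case block whose every path returns can never execute. Condensed (the op values are made up for the example):

    static bool supports(int op) {
        switch (op) {
            case 1: {
                return true;
            }            // was "} break;" -- the break was dead because the
                         // block always returns before reaching it
            default:
                return false;
        }
    }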
ggml/src/ggml-sycl/mmq.cpp CHANGED
@@ -3017,7 +3017,6 @@ void ggml_sycl_op_mul_mat_q(
             break;
         default:
             GGML_ABORT("fatal error");
-            break;
     }
 
     GGML_UNUSED(src1);
ggml/src/ggml-sycl/mmvq.cpp CHANGED
@@ -495,7 +495,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
                                   VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
@@ -519,7 +519,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
                                   VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
@@ -543,7 +543,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
                                   VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
@@ -567,7 +567,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
                                   VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
@@ -591,7 +591,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
                                   VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
@@ -615,7 +615,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
                                   VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
@@ -639,7 +639,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
                                   VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
@@ -663,7 +663,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
                                   VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
@@ -687,7 +687,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
                                   VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
@@ -711,7 +711,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
                                   VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
@@ -734,7 +734,7 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
@@ -755,7 +755,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
@@ -777,7 +777,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
@@ -799,7 +799,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
@@ -821,7 +821,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
@@ -843,7 +843,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
@@ -864,7 +864,7 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
@@ -886,7 +886,7 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
@@ -908,7 +908,7 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
-                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                     mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
@@ -1003,7 +1003,6 @@ void ggml_sycl_op_mul_mat_vec_q(
             break;
         default:
             GGML_ABORT("fatal error");
-            break;
     }
     }
     GGML_UNUSED(src1);
843
  cgh.parallel_for(
844
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
845
  [=](sycl::nd_item<3> item_ct1)
846
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
847
  mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
848
  vx, vy, dst, ncols, nrows, item_ct1);
849
  });
 
864
  cgh.parallel_for(
865
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
866
  [=](sycl::nd_item<3> item_ct1)
867
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
868
  mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
869
  vx, vy, dst, ncols, nrows, item_ct1);
870
  });
 
886
  cgh.parallel_for(
887
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
888
  [=](sycl::nd_item<3> item_ct1)
889
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
890
  mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
891
  vx, vy, dst, ncols, nrows, item_ct1);
892
  });
 
908
  cgh.parallel_for(
909
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
910
  [=](sycl::nd_item<3> item_ct1)
911
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
912
  mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
913
  vx, vy, dst, ncols, nrows, item_ct1);
914
  });
 
1003
  break;
1004
  default:
1005
  GGML_ABORT("fatal error");
 
1006
  }
1007
  }
1008
  GGML_UNUSED(src1);
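
Every hunk above makes the same one-line change: the DPC++-specific spelling [[intel::reqd_sub_group_size(N)]] is replaced by the SYCL 2020 standard [[sycl::reqd_sub_group_size(N)]], since recent DPC++ compilers deprecate the intel:: form and emit the warnings this commit fixes. The -1003 hunk additionally drops a break; after GGML_ABORT("fatal error"), which never returns, so the break was unreachable code. A minimal standalone sketch of the standard attribute form, assuming an icpx -fsycl toolchain (names here are illustrative, not code from this patch):

    // Sketch only: SUB_GROUP_SIZE stands in for WARP_SIZE and must be a
    // sub-group size the target device actually supports.
    #include <sycl/sycl.hpp>

    constexpr uint32_t SUB_GROUP_SIZE = 32;

    int main() {
        sycl::queue q;
        q.parallel_for(
             sycl::nd_range<1>(sycl::range<1>(SUB_GROUP_SIZE),
                               sycl::range<1>(SUB_GROUP_SIZE)),
             [=](sycl::nd_item<1> it)
                 [[sycl::reqd_sub_group_size(SUB_GROUP_SIZE)]] {
                 // The kernel body may assume a fixed sub-group size here,
                 // e.g. for shuffle-based reductions.
                 (void) it.get_sub_group();
             })
            .wait();
        return 0;
    }
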
ggml/src/ggml-sycl/norm.cpp CHANGED
@@ -235,7 +235,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
  sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
  block_dims),
  [=](sycl::nd_item<3> item_ct1)
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
  norm_f32(x, dst, ncols, eps, item_ct1,
  nullptr, WARP_SIZE);
  });
@@ -258,7 +258,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
  sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
  block_dims),
  [=](sycl::nd_item<3> item_ct1)
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
  norm_f32(x, dst, ncols, eps, item_ct1,
  get_pointer(s_sum_acc_ct1), work_group_size);
  });
@@ -277,7 +277,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
  block_dims),
  [=](sycl::nd_item<3> item_ct1)
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
  group_norm_f32(
  x, dst, group_size, ne_elements, eps_ct4, item_ct1,
  nullptr, WARP_SIZE);
@@ -304,7 +304,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
  sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
  block_dims),
  [=](sycl::nd_item<3> item_ct1)
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
  group_norm_f32(x, dst, group_size, ne_elements,
  eps_ct4, item_ct1,
  get_pointer(s_sum_acc_ct1), work_group_size);
@@ -325,7 +325,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
  sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
  block_dims),
  [=](sycl::nd_item<3> item_ct1)
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
  rms_norm_f32(x, dst, ncols, eps, item_ct1,
  nullptr, WARP_SIZE);
  });
@@ -347,7 +347,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
  sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
  block_dims),
  [=](sycl::nd_item<3> item_ct1)
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
  rms_norm_f32(x, dst, ncols, eps, item_ct1,
  get_pointer(s_sum_acc_ct1), work_group_size);
  });
 
  sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
  block_dims),
  [=](sycl::nd_item<3> item_ct1)
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
  norm_f32(x, dst, ncols, eps, item_ct1,
  nullptr, WARP_SIZE);
  });

  sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
  block_dims),
  [=](sycl::nd_item<3> item_ct1)
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
  norm_f32(x, dst, ncols, eps, item_ct1,
  get_pointer(s_sum_acc_ct1), work_group_size);
  });

  sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
  block_dims),
  [=](sycl::nd_item<3> item_ct1)
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
  group_norm_f32(
  x, dst, group_size, ne_elements, eps_ct4, item_ct1,
  nullptr, WARP_SIZE);

  sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
  block_dims),
  [=](sycl::nd_item<3> item_ct1)
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
  group_norm_f32(x, dst, group_size, ne_elements,
  eps_ct4, item_ct1,
  get_pointer(s_sum_acc_ct1), work_group_size);

  sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
  block_dims),
  [=](sycl::nd_item<3> item_ct1)
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
  rms_norm_f32(x, dst, ncols, eps, item_ct1,
  nullptr, WARP_SIZE);
  });

  sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
  block_dims),
  [=](sycl::nd_item<3> item_ct1)
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
  rms_norm_f32(x, dst, ncols, eps, item_ct1,
  get_pointer(s_sum_acc_ct1), work_group_size);
  });
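
The norm.cpp hunks pin the sub-group size on reduction kernels (norm_f32, group_norm_f32, rms_norm_f32), whose partial-sum logic is written against WARP_SIZE, so a fixed sub-group width keeps the reduction shape predictable. A hedged sketch of a sub-group sum of that general kind, using standard SYCL 2020 group algorithms (illustrative names, not code from norm.cpp):

    #include <sycl/sycl.hpp>
    #include <iostream>

    constexpr uint32_t SG = 32; // assumed sub-group size for illustration

    int main() {
        sycl::queue q;
        float *sum = sycl::malloc_shared<float>(1, q);
        q.parallel_for(
             sycl::nd_range<1>(sycl::range<1>(SG), sycl::range<1>(SG)),
             [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(SG)]] {
                 auto sg = it.get_sub_group();
                 float v = 1.0f; // each work-item contributes one partial value
                 float s = sycl::reduce_over_group(sg, v, sycl::plus<float>());
                 if (it.get_local_linear_id() == 0) {
                     *sum = s; // s == SG, the full sub-group total
                 }
             })
            .wait();
        std::cout << *sum << "\n"; // expected: 32
        sycl::free(sum, q);
        return 0;
    }
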
ggml/src/ggml-sycl/softmax.cpp CHANGED
@@ -132,7 +132,7 @@ static void soft_max_f32_submitter(const float * x, const T * mask, float * dst,

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
  soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
  nrows_y, scale, max_bias, m0,
  m1, n_head_log2, item_ct1,
 

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
  soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
  nrows_y, scale, max_bias, m0,
  m1, n_head_log2, item_ct1,
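
One caveat worth noting: a kernel pinned with [[sycl::reqd_sub_group_size(N)]] fails to launch on a device that does not support sub-group size N, so a quick sanity check is to query the device first. A hedged sketch of that query, using the standard info descriptor (illustrative, not part of this patch):

    #include <sycl/sycl.hpp>
    #include <iostream>

    int main() {
        sycl::queue q;
        auto dev   = q.get_device();
        // sub_group_sizes returns the list of sizes the device supports
        auto sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
        std::cout << dev.get_info<sycl::info::device::name>() << " supports:";
        for (size_t s : sizes) {
            std::cout << ' ' << s;
        }
        std::cout << '\n';
        return 0;
    }
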