Spaces:
Sleeping
Sleeping
Łukasz Ślusarczyk
committed on
Commit
·
77ff985
1
Parent(s):
727de7e
fixed compilation warnings in ggml-sycl (llama/12424)
Browse files- ggml/src/ggml-sycl/convert.cpp +1 -1
- ggml/src/ggml-sycl/dmmv.cpp +12 -13
- ggml/src/ggml-sycl/element_wise.cpp +40 -40
- ggml/src/ggml-sycl/getrows.cpp +1 -2
- ggml/src/ggml-sycl/ggml-sycl.cpp +21 -22
- ggml/src/ggml-sycl/mmq.cpp +0 -1
- ggml/src/ggml-sycl/mmvq.cpp +19 -20
- ggml/src/ggml-sycl/norm.cpp +6 -6
- ggml/src/ggml-sycl/softmax.cpp +1 -1
ggml/src/ggml-sycl/convert.cpp
CHANGED
|
@@ -138,7 +138,7 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int
|
|
| 138 |
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
|
| 139 |
sycl::range<3>(1, 1, WARP_SIZE),
|
| 140 |
sycl::range<3>(1, 1, WARP_SIZE)),
|
| 141 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 142 |
dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
|
| 143 |
});
|
| 144 |
|
|
|
|
| 138 |
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
|
| 139 |
sycl::range<3>(1, 1, WARP_SIZE),
|
| 140 |
sycl::range<3>(1, 1, WARP_SIZE)),
|
| 141 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
|
| 142 |
dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
|
| 143 |
});
|
| 144 |
|
ggml/src/ggml-sycl/dmmv.cpp
CHANGED
|
@@ -210,7 +210,7 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
|
|
| 210 |
|
| 211 |
stream->parallel_for(
|
| 212 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 213 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 214 |
dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
|
| 215 |
nrows, item_ct1);
|
| 216 |
});
|
|
@@ -879,7 +879,7 @@ static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloa
|
|
| 879 |
|
| 880 |
stream->parallel_for(
|
| 881 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 882 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 883 |
dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(
|
| 884 |
vx, y, dst, ncols, nrows, item_ct1);
|
| 885 |
});
|
|
@@ -902,7 +902,7 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
|
|
| 902 |
|
| 903 |
stream->parallel_for(
|
| 904 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 905 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 906 |
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
|
| 907 |
vx, y, dst, ncols, nrows, item_ct1);
|
| 908 |
});
|
|
@@ -923,7 +923,7 @@ static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
|
|
| 923 |
|
| 924 |
stream->parallel_for(
|
| 925 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 926 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 927 |
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
|
| 928 |
vx, y, dst, ncols, nrows, item_ct1);
|
| 929 |
});
|
|
@@ -944,7 +944,7 @@ static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y,
|
|
| 944 |
|
| 945 |
stream->parallel_for(
|
| 946 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 947 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 948 |
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
|
| 949 |
vx, y, dst, ncols, nrows, item_ct1);
|
| 950 |
});
|
|
@@ -965,7 +965,7 @@ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
|
|
| 965 |
|
| 966 |
stream->parallel_for(
|
| 967 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 968 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 969 |
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
|
| 970 |
vx, y, dst, ncols, nrows, item_ct1);
|
| 971 |
});
|
|
@@ -986,7 +986,7 @@ static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
|
|
| 986 |
|
| 987 |
stream->parallel_for(
|
| 988 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 989 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 990 |
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
|
| 991 |
vx, y, dst, ncols, nrows, item_ct1);
|
| 992 |
});
|
|
@@ -1004,7 +1004,7 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
|
|
| 1004 |
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
|
| 1005 |
stream->parallel_for(
|
| 1006 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1007 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 1008 |
dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
|
| 1009 |
});
|
| 1010 |
}
|
|
@@ -1020,7 +1020,7 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
|
|
| 1020 |
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
|
| 1021 |
stream->parallel_for(
|
| 1022 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1023 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 1024 |
dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
|
| 1025 |
});
|
| 1026 |
}
|
|
@@ -1036,7 +1036,7 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
|
|
| 1036 |
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
|
| 1037 |
stream->parallel_for(
|
| 1038 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1039 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 1040 |
dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
|
| 1041 |
});
|
| 1042 |
}
|
|
@@ -1049,7 +1049,7 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
|
|
| 1049 |
const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
|
| 1050 |
stream->parallel_for(
|
| 1051 |
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
|
| 1052 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 1053 |
dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
|
| 1054 |
});
|
| 1055 |
}
|
|
@@ -1065,7 +1065,7 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
|
|
| 1065 |
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
|
| 1066 |
stream->parallel_for(
|
| 1067 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1068 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 1069 |
dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
|
| 1070 |
});
|
| 1071 |
}
|
|
@@ -1143,7 +1143,6 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
|
| 1143 |
default:
|
| 1144 |
printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
|
| 1145 |
GGML_ABORT("fatal error");
|
| 1146 |
-
break;
|
| 1147 |
}
|
| 1148 |
|
| 1149 |
GGML_UNUSED(src1);
|
|
|
|
| 210 |
|
| 211 |
stream->parallel_for(
|
| 212 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 213 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 214 |
dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
|
| 215 |
nrows, item_ct1);
|
| 216 |
});
|
|
|
|
| 879 |
|
| 880 |
stream->parallel_for(
|
| 881 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 882 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 883 |
dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(
|
| 884 |
vx, y, dst, ncols, nrows, item_ct1);
|
| 885 |
});
|
|
|
|
| 902 |
|
| 903 |
stream->parallel_for(
|
| 904 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 905 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 906 |
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
|
| 907 |
vx, y, dst, ncols, nrows, item_ct1);
|
| 908 |
});
|
|
|
|
| 923 |
|
| 924 |
stream->parallel_for(
|
| 925 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 926 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 927 |
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
|
| 928 |
vx, y, dst, ncols, nrows, item_ct1);
|
| 929 |
});
|
|
|
|
| 944 |
|
| 945 |
stream->parallel_for(
|
| 946 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 947 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 948 |
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
|
| 949 |
vx, y, dst, ncols, nrows, item_ct1);
|
| 950 |
});
|
|
|
|
| 965 |
|
| 966 |
stream->parallel_for(
|
| 967 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 968 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 969 |
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
|
| 970 |
vx, y, dst, ncols, nrows, item_ct1);
|
| 971 |
});
|
|
|
|
| 986 |
|
| 987 |
stream->parallel_for(
|
| 988 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 989 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 990 |
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
|
| 991 |
vx, y, dst, ncols, nrows, item_ct1);
|
| 992 |
});
|
|
|
|
| 1004 |
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
|
| 1005 |
stream->parallel_for(
|
| 1006 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1007 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
| 1008 |
dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
|
| 1009 |
});
|
| 1010 |
}
|
|
|
|
| 1020 |
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
|
| 1021 |
stream->parallel_for(
|
| 1022 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1023 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
| 1024 |
dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
|
| 1025 |
});
|
| 1026 |
}
|
|
|
|
| 1036 |
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
|
| 1037 |
stream->parallel_for(
|
| 1038 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1039 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
| 1040 |
dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
|
| 1041 |
});
|
| 1042 |
}
|
|
|
|
| 1049 |
const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
|
| 1050 |
stream->parallel_for(
|
| 1051 |
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
|
| 1052 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
| 1053 |
dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
|
| 1054 |
});
|
| 1055 |
}
|
|
|
|
| 1065 |
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
|
| 1066 |
stream->parallel_for(
|
| 1067 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1068 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
| 1069 |
dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
|
| 1070 |
});
|
| 1071 |
}
|
|
|
|
| 1143 |
default:
|
| 1144 |
printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
|
| 1145 |
GGML_ABORT("fatal error");
|
|
|
|
| 1146 |
}
|
| 1147 |
|
| 1148 |
GGML_UNUSED(src1);
|
ggml/src/ggml-sycl/element_wise.cpp
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
#include "common.hpp"
|
| 2 |
#include "element_wise.hpp"
|
| 3 |
|
| 4 |
-
void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
| 5 |
const int ne10, const int ne11, const int ne12,
|
| 6 |
const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
|
| 7 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
@@ -20,7 +20,7 @@ void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
|
| 20 |
}
|
| 21 |
}
|
| 22 |
|
| 23 |
-
void gelu_f32(const float * x, float * dst, const int k,
|
| 24 |
const sycl::nd_item<3> &item_ct1) {
|
| 25 |
const float GELU_COEF_A = 0.044715f;
|
| 26 |
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
|
@@ -37,7 +37,7 @@ void gelu_f32(const float * x, float * dst, const int k,
|
|
| 37 |
sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
|
| 38 |
}
|
| 39 |
|
| 40 |
-
void silu_f32(const float * x, float * dst, const int k,
|
| 41 |
const sycl::nd_item<3> &item_ct1) {
|
| 42 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 43 |
item_ct1.get_local_id(2);
|
|
@@ -48,7 +48,7 @@ void silu_f32(const float * x, float * dst, const int k,
|
|
| 48 |
dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
|
| 49 |
}
|
| 50 |
|
| 51 |
-
void gelu_quick_f32(const float *x, float *dst, int k,
|
| 52 |
const sycl::nd_item<3> &item_ct1) {
|
| 53 |
const float GELU_QUICK_COEF = -1.702f;
|
| 54 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
@@ -59,7 +59,7 @@ void gelu_quick_f32(const float *x, float *dst, int k,
|
|
| 59 |
dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
|
| 60 |
}
|
| 61 |
|
| 62 |
-
void tanh_f32(const float *x, float *dst, int k,
|
| 63 |
const sycl::nd_item<3> &item_ct1) {
|
| 64 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 65 |
item_ct1.get_local_id(2);
|
|
@@ -69,7 +69,7 @@ void tanh_f32(const float *x, float *dst, int k,
|
|
| 69 |
dst[i] = sycl::tanh((float)(x[i]));
|
| 70 |
}
|
| 71 |
|
| 72 |
-
void relu_f32(const float * x, float * dst, const int k,
|
| 73 |
const sycl::nd_item<3> &item_ct1) {
|
| 74 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 75 |
item_ct1.get_local_id(2);
|
|
@@ -80,7 +80,7 @@ void relu_f32(const float * x, float * dst, const int k,
|
|
| 80 |
dst[i] = sycl::fmax((float)(x[i]), (float)0);
|
| 81 |
}
|
| 82 |
|
| 83 |
-
void sigmoid_f32(const float * x, float * dst, const int k,
|
| 84 |
const sycl::nd_item<3> &item_ct1) {
|
| 85 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 86 |
item_ct1.get_local_id(2);
|
|
@@ -91,7 +91,7 @@ void sigmoid_f32(const float * x, float * dst, const int k,
|
|
| 91 |
dst[i] = 1.0f / (1.0f + sycl::native::exp(-x[i]));
|
| 92 |
}
|
| 93 |
|
| 94 |
-
void sqrt_f32(const float * x, float * dst, const int k,
|
| 95 |
const sycl::nd_item<3> &item_ct1) {
|
| 96 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 97 |
item_ct1.get_local_id(2);
|
|
@@ -102,7 +102,7 @@ void sqrt_f32(const float * x, float * dst, const int k,
|
|
| 102 |
dst[i] = sycl::sqrt(x[i]);
|
| 103 |
}
|
| 104 |
|
| 105 |
-
void sin_f32(const float * x, float * dst, const int k,
|
| 106 |
const sycl::nd_item<3> &item_ct1) {
|
| 107 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 108 |
item_ct1.get_local_id(2);
|
|
@@ -113,7 +113,7 @@ void sin_f32(const float * x, float * dst, const int k,
|
|
| 113 |
dst[i] = sycl::sin(x[i]);
|
| 114 |
}
|
| 115 |
|
| 116 |
-
void cos_f32(const float * x, float * dst, const int k,
|
| 117 |
const sycl::nd_item<3> &item_ct1) {
|
| 118 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 119 |
item_ct1.get_local_id(2);
|
|
@@ -124,7 +124,7 @@ void cos_f32(const float * x, float * dst, const int k,
|
|
| 124 |
dst[i] = sycl::cos(x[i]);
|
| 125 |
}
|
| 126 |
|
| 127 |
-
void hardsigmoid_f32(const float * x, float * dst, const int k,
|
| 128 |
const sycl::nd_item<3> &item_ct1) {
|
| 129 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 130 |
item_ct1.get_local_id(2);
|
|
@@ -135,7 +135,7 @@ void hardsigmoid_f32(const float * x, float * dst, const int k,
|
|
| 135 |
dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
|
| 136 |
}
|
| 137 |
|
| 138 |
-
void hardswish_f32(const float * x, float * dst, const int k,
|
| 139 |
const sycl::nd_item<3> &item_ct1) {
|
| 140 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 141 |
item_ct1.get_local_id(2);
|
|
@@ -146,7 +146,7 @@ void hardswish_f32(const float * x, float * dst, const int k,
|
|
| 146 |
dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
|
| 147 |
}
|
| 148 |
|
| 149 |
-
void exp_f32(const float * x, float * dst, const int k,
|
| 150 |
const sycl::nd_item<3> &item_ct1) {
|
| 151 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 152 |
item_ct1.get_local_id(2);
|
|
@@ -157,7 +157,7 @@ void exp_f32(const float * x, float * dst, const int k,
|
|
| 157 |
dst[i] = sycl::exp(x[i]);
|
| 158 |
}
|
| 159 |
|
| 160 |
-
void log_f32(const float * x, float * dst, const int k,
|
| 161 |
const sycl::nd_item<3> &item_ct1) {
|
| 162 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 163 |
item_ct1.get_local_id(2);
|
|
@@ -173,7 +173,7 @@ void log_f32(const float * x, float * dst, const int k,
|
|
| 173 |
}
|
| 174 |
}
|
| 175 |
|
| 176 |
-
void neg_f32(const float * x, float * dst, const int k,
|
| 177 |
const sycl::nd_item<3> &item_ct1) {
|
| 178 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 179 |
item_ct1.get_local_id(2);
|
|
@@ -184,7 +184,7 @@ void neg_f32(const float * x, float * dst, const int k,
|
|
| 184 |
dst[i] = -x[i];
|
| 185 |
}
|
| 186 |
|
| 187 |
-
void step_f32(const float * x, float * dst, const int k,
|
| 188 |
const sycl::nd_item<3> &item_ct1) {
|
| 189 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 190 |
item_ct1.get_local_id(2);
|
|
@@ -195,7 +195,7 @@ void step_f32(const float * x, float * dst, const int k,
|
|
| 195 |
dst[i] = x[i] > 0.0f;
|
| 196 |
}
|
| 197 |
|
| 198 |
-
void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
|
| 199 |
const sycl::nd_item<3> &item_ct1) {
|
| 200 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 201 |
item_ct1.get_local_id(2);
|
|
@@ -206,7 +206,7 @@ void leaky_relu_f32(const float *x, float *dst, const int k, const float negativ
|
|
| 206 |
sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
|
| 207 |
}
|
| 208 |
|
| 209 |
-
void sqr_f32(const float * x, float * dst, const int k,
|
| 210 |
const sycl::nd_item<3> &item_ct1) {
|
| 211 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 212 |
item_ct1.get_local_id(2);
|
|
@@ -217,7 +217,7 @@ void sqr_f32(const float * x, float * dst, const int k,
|
|
| 217 |
dst[i] = x[i] * x[i];
|
| 218 |
}
|
| 219 |
|
| 220 |
-
void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
| 221 |
const int nb02, const int nb03, const int ne10, const int ne11,
|
| 222 |
const int ne12, const int ne13, const float sf0, const float sf1,
|
| 223 |
const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
|
|
@@ -240,7 +240,7 @@ void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
|
| 240 |
dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
| 241 |
}
|
| 242 |
|
| 243 |
-
void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
|
| 244 |
const sycl::nd_item<3> &item_ct1) {
|
| 245 |
int nidx = item_ct1.get_local_id(2) +
|
| 246 |
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
|
@@ -262,7 +262,7 @@ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const i
|
|
| 262 |
|
| 263 |
|
| 264 |
|
| 265 |
-
void acc_f32_sycl(const float *x, const float *y, float *dst,
|
| 266 |
const int n_elements, const int ne10, const int ne11,
|
| 267 |
const int ne12, const int nb1, const int nb2,
|
| 268 |
const int offset, queue_ptr stream) {
|
|
@@ -277,7 +277,7 @@ void acc_f32_sycl(const float *x, const float *y, float *dst,
|
|
| 277 |
});
|
| 278 |
}
|
| 279 |
|
| 280 |
-
void gelu_f32_sycl(const float *x, float *dst, const int k,
|
| 281 |
queue_ptr stream) {
|
| 282 |
const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
|
| 283 |
stream->parallel_for(
|
|
@@ -289,7 +289,7 @@ void gelu_f32_sycl(const float *x, float *dst, const int k,
|
|
| 289 |
});
|
| 290 |
}
|
| 291 |
|
| 292 |
-
void silu_f32_sycl(const float *x, float *dst, const int k,
|
| 293 |
queue_ptr stream) {
|
| 294 |
const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
|
| 295 |
stream->parallel_for(
|
|
@@ -301,7 +301,7 @@ void silu_f32_sycl(const float *x, float *dst, const int k,
|
|
| 301 |
});
|
| 302 |
}
|
| 303 |
|
| 304 |
-
void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
|
| 305 |
queue_ptr stream) {
|
| 306 |
const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
|
| 307 |
stream->parallel_for(
|
|
@@ -313,7 +313,7 @@ void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
|
|
| 313 |
});
|
| 314 |
}
|
| 315 |
|
| 316 |
-
void tanh_f32_sycl(const float *x, float *dst, const int k,
|
| 317 |
queue_ptr stream) {
|
| 318 |
const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
|
| 319 |
stream->parallel_for(
|
|
@@ -325,7 +325,7 @@ void tanh_f32_sycl(const float *x, float *dst, const int k,
|
|
| 325 |
});
|
| 326 |
}
|
| 327 |
|
| 328 |
-
void relu_f32_sycl(const float *x, float *dst, const int k,
|
| 329 |
queue_ptr stream) {
|
| 330 |
const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
|
| 331 |
stream->parallel_for(
|
|
@@ -337,7 +337,7 @@ void relu_f32_sycl(const float *x, float *dst, const int k,
|
|
| 337 |
});
|
| 338 |
}
|
| 339 |
|
| 340 |
-
void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
|
| 341 |
queue_ptr stream) {
|
| 342 |
const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
|
| 343 |
stream->parallel_for(
|
|
@@ -349,7 +349,7 @@ void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
|
|
| 349 |
});
|
| 350 |
}
|
| 351 |
|
| 352 |
-
void hardswish_f32_sycl(const float *x, float *dst, const int k,
|
| 353 |
queue_ptr stream) {
|
| 354 |
const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
|
| 355 |
stream->parallel_for(
|
|
@@ -361,7 +361,7 @@ void hardswish_f32_sycl(const float *x, float *dst, const int k,
|
|
| 361 |
});
|
| 362 |
}
|
| 363 |
|
| 364 |
-
void exp_f32_sycl(const float *x, float *dst, const int k,
|
| 365 |
queue_ptr stream) {
|
| 366 |
const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
|
| 367 |
stream->parallel_for(
|
|
@@ -373,7 +373,7 @@ void exp_f32_sycl(const float *x, float *dst, const int k,
|
|
| 373 |
});
|
| 374 |
}
|
| 375 |
|
| 376 |
-
void log_f32_sycl(const float *x, float *dst, const int k,
|
| 377 |
queue_ptr stream) {
|
| 378 |
const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
|
| 379 |
stream->parallel_for(
|
|
@@ -385,7 +385,7 @@ void log_f32_sycl(const float *x, float *dst, const int k,
|
|
| 385 |
});
|
| 386 |
}
|
| 387 |
|
| 388 |
-
void neg_f32_sycl(const float *x, float *dst, const int k,
|
| 389 |
queue_ptr stream) {
|
| 390 |
const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
|
| 391 |
stream->parallel_for(
|
|
@@ -397,7 +397,7 @@ void neg_f32_sycl(const float *x, float *dst, const int k,
|
|
| 397 |
});
|
| 398 |
}
|
| 399 |
|
| 400 |
-
void step_f32_sycl(const float *x, float *dst, const int k,
|
| 401 |
queue_ptr stream) {
|
| 402 |
const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
|
| 403 |
stream->parallel_for(
|
|
@@ -409,7 +409,7 @@ void step_f32_sycl(const float *x, float *dst, const int k,
|
|
| 409 |
});
|
| 410 |
}
|
| 411 |
|
| 412 |
-
void sigmoid_f32_sycl(const float *x, float *dst, const int k,
|
| 413 |
queue_ptr stream) {
|
| 414 |
const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
|
| 415 |
stream->parallel_for(
|
|
@@ -421,7 +421,7 @@ void sigmoid_f32_sycl(const float *x, float *dst, const int k,
|
|
| 421 |
});
|
| 422 |
}
|
| 423 |
|
| 424 |
-
void sqrt_f32_sycl(const float *x, float *dst, const int k,
|
| 425 |
queue_ptr stream) {
|
| 426 |
const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
|
| 427 |
stream->parallel_for(
|
|
@@ -433,7 +433,7 @@ void sqrt_f32_sycl(const float *x, float *dst, const int k,
|
|
| 433 |
});
|
| 434 |
}
|
| 435 |
|
| 436 |
-
void sin_f32_sycl(const float *x, float *dst, const int k,
|
| 437 |
queue_ptr stream) {
|
| 438 |
const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
|
| 439 |
stream->parallel_for(
|
|
@@ -445,7 +445,7 @@ void sin_f32_sycl(const float *x, float *dst, const int k,
|
|
| 445 |
});
|
| 446 |
}
|
| 447 |
|
| 448 |
-
void cos_f32_sycl(const float *x, float *dst, const int k,
|
| 449 |
queue_ptr stream) {
|
| 450 |
const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
|
| 451 |
stream->parallel_for(
|
|
@@ -457,7 +457,7 @@ void cos_f32_sycl(const float *x, float *dst, const int k,
|
|
| 457 |
});
|
| 458 |
}
|
| 459 |
|
| 460 |
-
void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
|
| 461 |
const float negative_slope,
|
| 462 |
queue_ptr stream) {
|
| 463 |
const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
|
|
@@ -470,7 +470,7 @@ void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
|
|
| 470 |
});
|
| 471 |
}
|
| 472 |
|
| 473 |
-
void sqr_f32_sycl(const float *x, float *dst, const int k,
|
| 474 |
queue_ptr stream) {
|
| 475 |
const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
|
| 476 |
stream->parallel_for(
|
|
@@ -482,7 +482,7 @@ void sqr_f32_sycl(const float *x, float *dst, const int k,
|
|
| 482 |
});
|
| 483 |
}
|
| 484 |
|
| 485 |
-
void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
|
| 486 |
const int nb02, const int nb03, const int ne10, const int ne11,
|
| 487 |
const int ne12, const int ne13, const float sf0, const float sf1,
|
| 488 |
const float sf2, const float sf3, queue_ptr stream) {
|
|
@@ -496,7 +496,7 @@ void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01
|
|
| 496 |
});
|
| 497 |
}
|
| 498 |
|
| 499 |
-
void pad_f32_sycl(const float *x, float *dst, const int ne00,
|
| 500 |
const int ne01, const int ne02, const int ne0,
|
| 501 |
const int ne1, const int ne2, queue_ptr stream) {
|
| 502 |
int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
|
|
|
|
| 1 |
#include "common.hpp"
|
| 2 |
#include "element_wise.hpp"
|
| 3 |
|
| 4 |
+
static void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
| 5 |
const int ne10, const int ne11, const int ne12,
|
| 6 |
const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
|
| 7 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
|
| 20 |
}
|
| 21 |
}
|
| 22 |
|
| 23 |
+
static void gelu_f32(const float * x, float * dst, const int k,
|
| 24 |
const sycl::nd_item<3> &item_ct1) {
|
| 25 |
const float GELU_COEF_A = 0.044715f;
|
| 26 |
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
|
|
|
| 37 |
sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
|
| 38 |
}
|
| 39 |
|
| 40 |
+
static void silu_f32(const float * x, float * dst, const int k,
|
| 41 |
const sycl::nd_item<3> &item_ct1) {
|
| 42 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 43 |
item_ct1.get_local_id(2);
|
|
|
|
| 48 |
dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
|
| 49 |
}
|
| 50 |
|
| 51 |
+
static void gelu_quick_f32(const float *x, float *dst, int k,
|
| 52 |
const sycl::nd_item<3> &item_ct1) {
|
| 53 |
const float GELU_QUICK_COEF = -1.702f;
|
| 54 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
|
| 59 |
dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
|
| 60 |
}
|
| 61 |
|
| 62 |
+
static void tanh_f32(const float *x, float *dst, int k,
|
| 63 |
const sycl::nd_item<3> &item_ct1) {
|
| 64 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 65 |
item_ct1.get_local_id(2);
|
|
|
|
| 69 |
dst[i] = sycl::tanh((float)(x[i]));
|
| 70 |
}
|
| 71 |
|
| 72 |
+
static void relu_f32(const float * x, float * dst, const int k,
|
| 73 |
const sycl::nd_item<3> &item_ct1) {
|
| 74 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 75 |
item_ct1.get_local_id(2);
|
|
|
|
| 80 |
dst[i] = sycl::fmax((float)(x[i]), (float)0);
|
| 81 |
}
|
| 82 |
|
| 83 |
+
static void sigmoid_f32(const float * x, float * dst, const int k,
|
| 84 |
const sycl::nd_item<3> &item_ct1) {
|
| 85 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 86 |
item_ct1.get_local_id(2);
|
|
|
|
| 91 |
dst[i] = 1.0f / (1.0f + sycl::native::exp(-x[i]));
|
| 92 |
}
|
| 93 |
|
| 94 |
+
static void sqrt_f32(const float * x, float * dst, const int k,
|
| 95 |
const sycl::nd_item<3> &item_ct1) {
|
| 96 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 97 |
item_ct1.get_local_id(2);
|
|
|
|
| 102 |
dst[i] = sycl::sqrt(x[i]);
|
| 103 |
}
|
| 104 |
|
| 105 |
+
static void sin_f32(const float * x, float * dst, const int k,
|
| 106 |
const sycl::nd_item<3> &item_ct1) {
|
| 107 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 108 |
item_ct1.get_local_id(2);
|
|
|
|
| 113 |
dst[i] = sycl::sin(x[i]);
|
| 114 |
}
|
| 115 |
|
| 116 |
+
static void cos_f32(const float * x, float * dst, const int k,
|
| 117 |
const sycl::nd_item<3> &item_ct1) {
|
| 118 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 119 |
item_ct1.get_local_id(2);
|
|
|
|
| 124 |
dst[i] = sycl::cos(x[i]);
|
| 125 |
}
|
| 126 |
|
| 127 |
+
static void hardsigmoid_f32(const float * x, float * dst, const int k,
|
| 128 |
const sycl::nd_item<3> &item_ct1) {
|
| 129 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 130 |
item_ct1.get_local_id(2);
|
|
|
|
| 135 |
dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
|
| 136 |
}
|
| 137 |
|
| 138 |
+
static void hardswish_f32(const float * x, float * dst, const int k,
|
| 139 |
const sycl::nd_item<3> &item_ct1) {
|
| 140 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 141 |
item_ct1.get_local_id(2);
|
|
|
|
| 146 |
dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
|
| 147 |
}
|
| 148 |
|
| 149 |
+
static void exp_f32(const float * x, float * dst, const int k,
|
| 150 |
const sycl::nd_item<3> &item_ct1) {
|
| 151 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 152 |
item_ct1.get_local_id(2);
|
|
|
|
| 157 |
dst[i] = sycl::exp(x[i]);
|
| 158 |
}
|
| 159 |
|
| 160 |
+
static void log_f32(const float * x, float * dst, const int k,
|
| 161 |
const sycl::nd_item<3> &item_ct1) {
|
| 162 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 163 |
item_ct1.get_local_id(2);
|
|
|
|
| 173 |
}
|
| 174 |
}
|
| 175 |
|
| 176 |
+
static void neg_f32(const float * x, float * dst, const int k,
|
| 177 |
const sycl::nd_item<3> &item_ct1) {
|
| 178 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 179 |
item_ct1.get_local_id(2);
|
|
|
|
| 184 |
dst[i] = -x[i];
|
| 185 |
}
|
| 186 |
|
| 187 |
+
static void step_f32(const float * x, float * dst, const int k,
|
| 188 |
const sycl::nd_item<3> &item_ct1) {
|
| 189 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 190 |
item_ct1.get_local_id(2);
|
|
|
|
| 195 |
dst[i] = x[i] > 0.0f;
|
| 196 |
}
|
| 197 |
|
| 198 |
+
static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
|
| 199 |
const sycl::nd_item<3> &item_ct1) {
|
| 200 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 201 |
item_ct1.get_local_id(2);
|
|
|
|
| 206 |
sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
|
| 207 |
}
|
| 208 |
|
| 209 |
+
static void sqr_f32(const float * x, float * dst, const int k,
|
| 210 |
const sycl::nd_item<3> &item_ct1) {
|
| 211 |
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 212 |
item_ct1.get_local_id(2);
|
|
|
|
| 217 |
dst[i] = x[i] * x[i];
|
| 218 |
}
|
| 219 |
|
| 220 |
+
static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
| 221 |
const int nb02, const int nb03, const int ne10, const int ne11,
|
| 222 |
const int ne12, const int ne13, const float sf0, const float sf1,
|
| 223 |
const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
|
|
|
|
| 240 |
dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
| 241 |
}
|
| 242 |
|
| 243 |
+
static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
|
| 244 |
const sycl::nd_item<3> &item_ct1) {
|
| 245 |
int nidx = item_ct1.get_local_id(2) +
|
| 246 |
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
|
|
|
| 262 |
|
| 263 |
|
| 264 |
|
| 265 |
+
static void acc_f32_sycl(const float *x, const float *y, float *dst,
|
| 266 |
const int n_elements, const int ne10, const int ne11,
|
| 267 |
const int ne12, const int nb1, const int nb2,
|
| 268 |
const int offset, queue_ptr stream) {
|
|
|
|
| 277 |
});
|
| 278 |
}
|
| 279 |
|
| 280 |
+
static void gelu_f32_sycl(const float *x, float *dst, const int k,
|
| 281 |
queue_ptr stream) {
|
| 282 |
const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
|
| 283 |
stream->parallel_for(
|
|
|
|
| 289 |
});
|
| 290 |
}
|
| 291 |
|
| 292 |
+
static void silu_f32_sycl(const float *x, float *dst, const int k,
|
| 293 |
queue_ptr stream) {
|
| 294 |
const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
|
| 295 |
stream->parallel_for(
|
|
|
|
| 301 |
});
|
| 302 |
}
|
| 303 |
|
| 304 |
+
static void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
|
| 305 |
queue_ptr stream) {
|
| 306 |
const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
|
| 307 |
stream->parallel_for(
|
|
|
|
| 313 |
});
|
| 314 |
}
|
| 315 |
|
| 316 |
+
static void tanh_f32_sycl(const float *x, float *dst, const int k,
|
| 317 |
queue_ptr stream) {
|
| 318 |
const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
|
| 319 |
stream->parallel_for(
|
|
|
|
| 325 |
});
|
| 326 |
}
|
| 327 |
|
| 328 |
+
static void relu_f32_sycl(const float *x, float *dst, const int k,
|
| 329 |
queue_ptr stream) {
|
| 330 |
const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
|
| 331 |
stream->parallel_for(
|
|
|
|
| 337 |
});
|
| 338 |
}
|
| 339 |
|
| 340 |
+
static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
|
| 341 |
queue_ptr stream) {
|
| 342 |
const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
|
| 343 |
stream->parallel_for(
|
|
|
|
| 349 |
});
|
| 350 |
}
|
| 351 |
|
| 352 |
+
static void hardswish_f32_sycl(const float *x, float *dst, const int k,
|
| 353 |
queue_ptr stream) {
|
| 354 |
const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
|
| 355 |
stream->parallel_for(
|
|
|
|
| 361 |
});
|
| 362 |
}
|
| 363 |
|
| 364 |
+
static void exp_f32_sycl(const float *x, float *dst, const int k,
|
| 365 |
queue_ptr stream) {
|
| 366 |
const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
|
| 367 |
stream->parallel_for(
|
|
|
|
| 373 |
});
|
| 374 |
}
|
| 375 |
|
| 376 |
+
static void log_f32_sycl(const float *x, float *dst, const int k,
|
| 377 |
queue_ptr stream) {
|
| 378 |
const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
|
| 379 |
stream->parallel_for(
|
|
|
|
| 385 |
});
|
| 386 |
}
|
| 387 |
|
| 388 |
+
static void neg_f32_sycl(const float *x, float *dst, const int k,
|
| 389 |
queue_ptr stream) {
|
| 390 |
const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
|
| 391 |
stream->parallel_for(
|
|
|
|
| 397 |
});
|
| 398 |
}
|
| 399 |
|
| 400 |
+
static void step_f32_sycl(const float *x, float *dst, const int k,
|
| 401 |
queue_ptr stream) {
|
| 402 |
const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
|
| 403 |
stream->parallel_for(
|
|
|
|
| 409 |
});
|
| 410 |
}
|
| 411 |
|
| 412 |
+
static void sigmoid_f32_sycl(const float *x, float *dst, const int k,
|
| 413 |
queue_ptr stream) {
|
| 414 |
const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
|
| 415 |
stream->parallel_for(
|
|
|
|
| 421 |
});
|
| 422 |
}
|
| 423 |
|
| 424 |
+
static void sqrt_f32_sycl(const float *x, float *dst, const int k,
|
| 425 |
queue_ptr stream) {
|
| 426 |
const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
|
| 427 |
stream->parallel_for(
|
|
|
|
| 433 |
});
|
| 434 |
}
|
| 435 |
|
| 436 |
+
static void sin_f32_sycl(const float *x, float *dst, const int k,
|
| 437 |
queue_ptr stream) {
|
| 438 |
const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
|
| 439 |
stream->parallel_for(
|
|
|
|
| 445 |
});
|
| 446 |
}
|
| 447 |
|
| 448 |
+
static void cos_f32_sycl(const float *x, float *dst, const int k,
|
| 449 |
queue_ptr stream) {
|
| 450 |
const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
|
| 451 |
stream->parallel_for(
|
|
|
|
| 457 |
});
|
| 458 |
}
|
| 459 |
|
| 460 |
+
static void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
|
| 461 |
const float negative_slope,
|
| 462 |
queue_ptr stream) {
|
| 463 |
const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
|
|
|
|
| 470 |
});
|
| 471 |
}
|
| 472 |
|
| 473 |
+
static void sqr_f32_sycl(const float *x, float *dst, const int k,
|
| 474 |
queue_ptr stream) {
|
| 475 |
const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
|
| 476 |
stream->parallel_for(
|
|
|
|
| 482 |
});
|
| 483 |
}
|
| 484 |
|
| 485 |
+
static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
|
| 486 |
const int nb02, const int nb03, const int ne10, const int ne11,
|
| 487 |
const int ne12, const int ne13, const float sf0, const float sf1,
|
| 488 |
const float sf2, const float sf3, queue_ptr stream) {
|
|
|
|
| 496 |
});
|
| 497 |
}
|
| 498 |
|
| 499 |
+
static void pad_f32_sycl(const float *x, float *dst, const int ne00,
|
| 500 |
const int ne01, const int ne02, const int ne0,
|
| 501 |
const int ne1, const int ne2, queue_ptr stream) {
|
| 502 |
int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
|
ggml/src/ggml-sycl/getrows.cpp
CHANGED
|
@@ -207,7 +207,7 @@ static void get_rows_sycl_reorder(ggml_backend_sycl_context & ctx, const ggml_te
|
|
| 207 |
const size_t nrows = ne01;
|
| 208 |
const sycl::half* src0_dq = (const sycl::half*)(src0_q + nrows * ncols / 2);
|
| 209 |
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 210 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 211 |
k_get_rows_reorder<qk, qr, dq_reorder>(
|
| 212 |
src0_dd, src0_dq, src1_dd, dst_dd, ne00, ne12, s1, s2,
|
| 213 |
s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
|
|
@@ -302,7 +302,6 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *s
|
|
| 302 |
// TODO: k-quants
|
| 303 |
GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
|
| 304 |
GGML_ABORT("fatal error");
|
| 305 |
-
break;
|
| 306 |
}
|
| 307 |
}
|
| 308 |
|
|
|
|
| 207 |
const size_t nrows = ne01;
|
| 208 |
const sycl::half* src0_dq = (const sycl::half*)(src0_q + nrows * ncols / 2);
|
| 209 |
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 210 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
|
| 211 |
k_get_rows_reorder<qk, qr, dq_reorder>(
|
| 212 |
src0_dd, src0_dq, src1_dd, dst_dd, ne00, ne12, s1, s2,
|
| 213 |
s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
|
|
|
|
| 302 |
// TODO: k-quants
|
| 303 |
GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
|
| 304 |
GGML_ABORT("fatal error");
|
|
|
|
| 305 |
}
|
| 306 |
}
|
| 307 |
|
ggml/src/ggml-sycl/ggml-sycl.cpp
CHANGED
|
@@ -95,7 +95,7 @@ const ggml_sycl_device_info & ggml_sycl_info() {
|
|
| 95 |
return info;
|
| 96 |
}
|
| 97 |
|
| 98 |
-
void print_device_detail(int id, sycl::device &device, std::string device_type) {
|
| 99 |
|
| 100 |
dpct::device_info prop;
|
| 101 |
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
@@ -118,7 +118,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
|
|
| 118 |
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
|
| 119 |
}
|
| 120 |
|
| 121 |
-
void print_device_opt_feature(int device_count) {
|
| 122 |
GGML_LOG_INFO("SYCL Optimization Feature:\n");
|
| 123 |
GGML_LOG_INFO(
|
| 124 |
"|ID| Device Type|Reorder|\n");
|
|
@@ -401,7 +401,7 @@ catch (sycl::exception const &exc) {
|
|
| 401 |
std::exit(1);
|
| 402 |
}
|
| 403 |
|
| 404 |
-
void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
|
| 405 |
const void *ptr_src, size_t size) {
|
| 406 |
char *host_buf = (char *)malloc(size);
|
| 407 |
q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
|
|
@@ -620,7 +620,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
|
| 620 |
return &ggml_backend_sycl_buffer_types[device];
|
| 621 |
}
|
| 622 |
|
| 623 |
-
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
|
| 624 |
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
|
| 625 |
|
| 626 |
int device = ctx->device;
|
|
@@ -1682,7 +1682,7 @@ static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
|
|
| 1682 |
|
| 1683 |
stream->parallel_for(
|
| 1684 |
sycl::nd_range<3>(num_blocks * block_size, block_size),
|
| 1685 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 1686 |
quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
|
| 1687 |
});
|
| 1688 |
}
|
|
@@ -1703,7 +1703,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
|
|
| 1703 |
|
| 1704 |
stream->parallel_for(
|
| 1705 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1706 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 1707 |
mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
|
| 1708 |
nchannels_y, item_ct1);
|
| 1709 |
});
|
|
@@ -1723,7 +1723,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
|
|
| 1723 |
|
| 1724 |
stream->parallel_for(
|
| 1725 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1726 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 1727 |
mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
|
| 1728 |
row_stride_x, channel_stride_x,
|
| 1729 |
nchannels_y / nchannels_x, item_ct1);
|
|
@@ -1764,7 +1764,7 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
|
|
| 1764 |
const sycl::range<3> block_nums(1, nrows, 1);
|
| 1765 |
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1766 |
[=](sycl::nd_item<3> item_ct1)
|
| 1767 |
-
[[
|
| 1768 |
k_sum_rows_f32(x, dst, ncols, item_ct1);
|
| 1769 |
});
|
| 1770 |
}
|
|
@@ -2920,7 +2920,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
|
|
| 2920 |
return false;
|
| 2921 |
}
|
| 2922 |
|
| 2923 |
-
bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
| 2924 |
switch (type) {
|
| 2925 |
case GGML_TYPE_Q4_0:
|
| 2926 |
case GGML_TYPE_Q4_1:
|
|
@@ -3293,7 +3293,7 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
|
|
| 3293 |
}
|
| 3294 |
|
| 3295 |
|
| 3296 |
-
void ggml_sycl_set_main_device(const int main_device) try {
|
| 3297 |
if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
|
| 3298 |
return;
|
| 3299 |
}
|
|
@@ -3314,7 +3314,7 @@ catch (sycl::exception const &exc) {
|
|
| 3314 |
std::exit(1);
|
| 3315 |
}
|
| 3316 |
|
| 3317 |
-
bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
|
| 3318 |
if (!g_sycl_loaded) return false;
|
| 3319 |
|
| 3320 |
if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) {
|
|
@@ -3638,7 +3638,7 @@ catch (sycl::exception const &exc) {
|
|
| 3638 |
std::exit(1);
|
| 3639 |
}
|
| 3640 |
|
| 3641 |
-
void reorder_qw(char *data_device, const int ncols, const int nrows,
|
| 3642 |
size_t size, size_t offset, dpct::queue_ptr stream) {
|
| 3643 |
auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
|
| 3644 |
SYCL_CHECK(
|
|
@@ -3652,7 +3652,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
|
|
| 3652 |
|
| 3653 |
stream->parallel_for(
|
| 3654 |
size / sizeof(block_q4_0),
|
| 3655 |
-
[=](auto i) [[
|
| 3656 |
const block_q4_0* x = (const block_q4_0*)tmp_buf;
|
| 3657 |
const int ib = i;
|
| 3658 |
|
|
@@ -3666,7 +3666,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
|
|
| 3666 |
sycl::free(tmp_buf, *stream);
|
| 3667 |
}
|
| 3668 |
|
| 3669 |
-
void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
|
| 3670 |
char*data_device = (char*)src0->data;
|
| 3671 |
size_t ncols = src0->ne[0];
|
| 3672 |
size_t nrows = src0->ne[1];
|
|
@@ -3675,7 +3675,7 @@ void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
|
|
| 3675 |
reorder_qw(data_device, ncols, nrows, size, 0, stream);
|
| 3676 |
}
|
| 3677 |
|
| 3678 |
-
void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
|
| 3679 |
ggml_tensor *src0 = dst->src[0];
|
| 3680 |
ggml_tensor *src1 = dst->src[1];
|
| 3681 |
|
|
@@ -3688,7 +3688,7 @@ void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
|
|
| 3688 |
}
|
| 3689 |
}
|
| 3690 |
|
| 3691 |
-
void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
|
| 3692 |
dpct::queue_ptr stream = ctx->stream();
|
| 3693 |
if (ctx->optimized_graph) {
|
| 3694 |
return;
|
|
@@ -3878,7 +3878,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
| 3878 |
return true;
|
| 3879 |
}
|
| 3880 |
return false;
|
| 3881 |
-
}
|
| 3882 |
case GGML_OP_UNARY:
|
| 3883 |
switch (ggml_get_unary_op(op)) {
|
| 3884 |
case GGML_UNARY_OP_NEG:
|
|
@@ -3896,7 +3896,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
| 3896 |
default:
|
| 3897 |
return false;
|
| 3898 |
}
|
| 3899 |
-
break;
|
| 3900 |
case GGML_OP_MUL_MAT:
|
| 3901 |
case GGML_OP_MUL_MAT_ID:
|
| 3902 |
{
|
|
@@ -3927,7 +3926,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
| 3927 |
return false;
|
| 3928 |
}
|
| 3929 |
return true;
|
| 3930 |
-
}
|
| 3931 |
case GGML_OP_OUT_PROD:
|
| 3932 |
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
|
| 3933 |
case GGML_OP_GET_ROWS:
|
|
@@ -3944,7 +3943,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
| 3944 |
default:
|
| 3945 |
return false;
|
| 3946 |
}
|
| 3947 |
-
}
|
| 3948 |
case GGML_OP_CPY:
|
| 3949 |
{
|
| 3950 |
ggml_type src0_type = op->src[0]->type;
|
|
@@ -3995,12 +3994,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
| 3995 |
return true;
|
| 3996 |
}
|
| 3997 |
return false;
|
| 3998 |
-
}
|
| 3999 |
case GGML_OP_CONCAT:
|
| 4000 |
{
|
| 4001 |
ggml_type src0_type = op->src[0]->type;
|
| 4002 |
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
|
| 4003 |
-
}
|
| 4004 |
case GGML_OP_DUP:
|
| 4005 |
case GGML_OP_ARGMAX:
|
| 4006 |
case GGML_OP_NONE:
|
|
|
|
| 95 |
return info;
|
| 96 |
}
|
| 97 |
|
| 98 |
+
static void print_device_detail(int id, sycl::device &device, std::string device_type) {
|
| 99 |
|
| 100 |
dpct::device_info prop;
|
| 101 |
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
|
|
| 118 |
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
|
| 119 |
}
|
| 120 |
|
| 121 |
+
static void print_device_opt_feature(int device_count) {
|
| 122 |
GGML_LOG_INFO("SYCL Optimization Feature:\n");
|
| 123 |
GGML_LOG_INFO(
|
| 124 |
"|ID| Device Type|Reorder|\n");
|
|
|
|
| 401 |
std::exit(1);
|
| 402 |
}
|
| 403 |
|
| 404 |
+
static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
|
| 405 |
const void *ptr_src, size_t size) {
|
| 406 |
char *host_buf = (char *)malloc(size);
|
| 407 |
q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
|
|
|
|
| 620 |
return &ggml_backend_sycl_buffer_types[device];
|
| 621 |
}
|
| 622 |
|
| 623 |
+
static ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
|
| 624 |
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
|
| 625 |
|
| 626 |
int device = ctx->device;
|
|
|
|
| 1682 |
|
| 1683 |
stream->parallel_for(
|
| 1684 |
sycl::nd_range<3>(num_blocks * block_size, block_size),
|
| 1685 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 1686 |
quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
|
| 1687 |
});
|
| 1688 |
}
|
|
|
|
| 1703 |
|
| 1704 |
stream->parallel_for(
|
| 1705 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1706 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 1707 |
mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
|
| 1708 |
nchannels_y, item_ct1);
|
| 1709 |
});
|
|
|
|
| 1723 |
|
| 1724 |
stream->parallel_for(
|
| 1725 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1726 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 1727 |
mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
|
| 1728 |
row_stride_x, channel_stride_x,
|
| 1729 |
nchannels_y / nchannels_x, item_ct1);
|
|
|
|
| 1764 |
const sycl::range<3> block_nums(1, nrows, 1);
|
| 1765 |
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1766 |
[=](sycl::nd_item<3> item_ct1)
|
| 1767 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 1768 |
k_sum_rows_f32(x, dst, ncols, item_ct1);
|
| 1769 |
});
|
| 1770 |
}
|
|
|
|
| 2920 |
return false;
|
| 2921 |
}
|
| 2922 |
|
| 2923 |
+
static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
| 2924 |
switch (type) {
|
| 2925 |
case GGML_TYPE_Q4_0:
|
| 2926 |
case GGML_TYPE_Q4_1:
|
|
|
|
| 3293 |
}
|
| 3294 |
|
| 3295 |
|
| 3296 |
+
static void ggml_sycl_set_main_device(const int main_device) try {
|
| 3297 |
if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
|
| 3298 |
return;
|
| 3299 |
}
|
|
|
|
| 3314 |
std::exit(1);
|
| 3315 |
}
|
| 3316 |
|
| 3317 |
+
static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
|
| 3318 |
if (!g_sycl_loaded) return false;
|
| 3319 |
|
| 3320 |
if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) {
|
|
|
|
| 3638 |
std::exit(1);
|
| 3639 |
}
|
| 3640 |
|
| 3641 |
+
static void reorder_qw(char *data_device, const int ncols, const int nrows,
|
| 3642 |
size_t size, size_t offset, dpct::queue_ptr stream) {
|
| 3643 |
auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
|
| 3644 |
SYCL_CHECK(
|
|
|
|
| 3652 |
|
| 3653 |
stream->parallel_for(
|
| 3654 |
size / sizeof(block_q4_0),
|
| 3655 |
+
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 3656 |
const block_q4_0* x = (const block_q4_0*)tmp_buf;
|
| 3657 |
const int ib = i;
|
| 3658 |
|
|
|
|
| 3666 |
sycl::free(tmp_buf, *stream);
|
| 3667 |
}
|
| 3668 |
|
| 3669 |
+
static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
|
| 3670 |
char*data_device = (char*)src0->data;
|
| 3671 |
size_t ncols = src0->ne[0];
|
| 3672 |
size_t nrows = src0->ne[1];
|
|
|
|
| 3675 |
reorder_qw(data_device, ncols, nrows, size, 0, stream);
|
| 3676 |
}
|
| 3677 |
|
| 3678 |
+
static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
|
| 3679 |
ggml_tensor *src0 = dst->src[0];
|
| 3680 |
ggml_tensor *src1 = dst->src[1];
|
| 3681 |
|
|
|
|
| 3688 |
}
|
| 3689 |
}
|
| 3690 |
|
| 3691 |
+
static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
|
| 3692 |
dpct::queue_ptr stream = ctx->stream();
|
| 3693 |
if (ctx->optimized_graph) {
|
| 3694 |
return;
|
|
|
|
| 3878 |
return true;
|
| 3879 |
}
|
| 3880 |
return false;
|
| 3881 |
+
}
|
| 3882 |
case GGML_OP_UNARY:
|
| 3883 |
switch (ggml_get_unary_op(op)) {
|
| 3884 |
case GGML_UNARY_OP_NEG:
|
|
|
|
| 3896 |
default:
|
| 3897 |
return false;
|
| 3898 |
}
|
|
|
|
| 3899 |
case GGML_OP_MUL_MAT:
|
| 3900 |
case GGML_OP_MUL_MAT_ID:
|
| 3901 |
{
|
|
|
|
| 3926 |
return false;
|
| 3927 |
}
|
| 3928 |
return true;
|
| 3929 |
+
}
|
| 3930 |
case GGML_OP_OUT_PROD:
|
| 3931 |
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
|
| 3932 |
case GGML_OP_GET_ROWS:
|
|
|
|
| 3943 |
default:
|
| 3944 |
return false;
|
| 3945 |
}
|
| 3946 |
+
}
|
| 3947 |
case GGML_OP_CPY:
|
| 3948 |
{
|
| 3949 |
ggml_type src0_type = op->src[0]->type;
|
|
|
|
| 3994 |
return true;
|
| 3995 |
}
|
| 3996 |
return false;
|
| 3997 |
+
}
|
| 3998 |
case GGML_OP_CONCAT:
|
| 3999 |
{
|
| 4000 |
ggml_type src0_type = op->src[0]->type;
|
| 4001 |
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
|
| 4002 |
+
}
|
| 4003 |
case GGML_OP_DUP:
|
| 4004 |
case GGML_OP_ARGMAX:
|
| 4005 |
case GGML_OP_NONE:
|
ggml/src/ggml-sycl/mmq.cpp
CHANGED
|
@@ -3017,7 +3017,6 @@ void ggml_sycl_op_mul_mat_q(
|
|
| 3017 |
break;
|
| 3018 |
default:
|
| 3019 |
GGML_ABORT("fatal error");
|
| 3020 |
-
break;
|
| 3021 |
}
|
| 3022 |
|
| 3023 |
GGML_UNUSED(src1);
|
|
|
|
| 3017 |
break;
|
| 3018 |
default:
|
| 3019 |
GGML_ABORT("fatal error");
|
|
|
|
| 3020 |
}
|
| 3021 |
|
| 3022 |
GGML_UNUSED(src1);
|
ggml/src/ggml-sycl/mmvq.cpp
CHANGED
|
@@ -495,7 +495,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
|
| 495 |
cgh.parallel_for(
|
| 496 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 497 |
[=](sycl::nd_item<3> item_ct1)
|
| 498 |
-
[[
|
| 499 |
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
|
| 500 |
VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
|
| 501 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -519,7 +519,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
|
| 519 |
cgh.parallel_for(
|
| 520 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 521 |
[=](sycl::nd_item<3> item_ct1)
|
| 522 |
-
[[
|
| 523 |
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
|
| 524 |
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
|
| 525 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -543,7 +543,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
|
| 543 |
cgh.parallel_for(
|
| 544 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 545 |
[=](sycl::nd_item<3> item_ct1)
|
| 546 |
-
[[
|
| 547 |
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
|
| 548 |
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
|
| 549 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -567,7 +567,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
|
| 567 |
cgh.parallel_for(
|
| 568 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 569 |
[=](sycl::nd_item<3> item_ct1)
|
| 570 |
-
[[
|
| 571 |
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
|
| 572 |
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
|
| 573 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -591,7 +591,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
|
| 591 |
cgh.parallel_for(
|
| 592 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 593 |
[=](sycl::nd_item<3> item_ct1)
|
| 594 |
-
[[
|
| 595 |
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
|
| 596 |
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
|
| 597 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -615,7 +615,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
|
| 615 |
cgh.parallel_for(
|
| 616 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 617 |
[=](sycl::nd_item<3> item_ct1)
|
| 618 |
-
[[
|
| 619 |
mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
|
| 620 |
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
|
| 621 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -639,7 +639,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
| 639 |
cgh.parallel_for(
|
| 640 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 641 |
[=](sycl::nd_item<3> item_ct1)
|
| 642 |
-
[[
|
| 643 |
mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
|
| 644 |
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
|
| 645 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -663,7 +663,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
|
| 663 |
cgh.parallel_for(
|
| 664 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 665 |
[=](sycl::nd_item<3> item_ct1)
|
| 666 |
-
[[
|
| 667 |
mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
|
| 668 |
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
|
| 669 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -687,7 +687,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
|
| 687 |
cgh.parallel_for(
|
| 688 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 689 |
[=](sycl::nd_item<3> item_ct1)
|
| 690 |
-
[[
|
| 691 |
mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
|
| 692 |
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
|
| 693 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -711,7 +711,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
|
| 711 |
cgh.parallel_for(
|
| 712 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 713 |
[=](sycl::nd_item<3> item_ct1)
|
| 714 |
-
[[
|
| 715 |
mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
|
| 716 |
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
|
| 717 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -734,7 +734,7 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
|
|
| 734 |
cgh.parallel_for(
|
| 735 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 736 |
[=](sycl::nd_item<3> item_ct1)
|
| 737 |
-
[[
|
| 738 |
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
|
| 739 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 740 |
});
|
|
@@ -755,7 +755,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
|
|
| 755 |
cgh.parallel_for(
|
| 756 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 757 |
[=](sycl::nd_item<3> item_ct1)
|
| 758 |
-
[[
|
| 759 |
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
|
| 760 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 761 |
});
|
|
@@ -777,7 +777,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
|
|
| 777 |
cgh.parallel_for(
|
| 778 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 779 |
[=](sycl::nd_item<3> item_ct1)
|
| 780 |
-
[[
|
| 781 |
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
|
| 782 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 783 |
});
|
|
@@ -799,7 +799,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
|
|
| 799 |
cgh.parallel_for(
|
| 800 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 801 |
[=](sycl::nd_item<3> item_ct1)
|
| 802 |
-
[[
|
| 803 |
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
|
| 804 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 805 |
});
|
|
@@ -821,7 +821,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
|
|
| 821 |
cgh.parallel_for(
|
| 822 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 823 |
[=](sycl::nd_item<3> item_ct1)
|
| 824 |
-
[[
|
| 825 |
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
|
| 826 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 827 |
});
|
|
@@ -843,7 +843,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
|
|
| 843 |
cgh.parallel_for(
|
| 844 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 845 |
[=](sycl::nd_item<3> item_ct1)
|
| 846 |
-
[[
|
| 847 |
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
|
| 848 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 849 |
});
|
|
@@ -864,7 +864,7 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
|
|
| 864 |
cgh.parallel_for(
|
| 865 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 866 |
[=](sycl::nd_item<3> item_ct1)
|
| 867 |
-
[[
|
| 868 |
mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
|
| 869 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 870 |
});
|
|
@@ -886,7 +886,7 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
|
|
| 886 |
cgh.parallel_for(
|
| 887 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 888 |
[=](sycl::nd_item<3> item_ct1)
|
| 889 |
-
[[
|
| 890 |
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
|
| 891 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 892 |
});
|
|
@@ -908,7 +908,7 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
|
|
| 908 |
cgh.parallel_for(
|
| 909 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 910 |
[=](sycl::nd_item<3> item_ct1)
|
| 911 |
-
[[
|
| 912 |
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
|
| 913 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 914 |
});
|
|
@@ -1003,7 +1003,6 @@ void ggml_sycl_op_mul_mat_vec_q(
|
|
| 1003 |
break;
|
| 1004 |
default:
|
| 1005 |
GGML_ABORT("fatal error");
|
| 1006 |
-
break;
|
| 1007 |
}
|
| 1008 |
}
|
| 1009 |
GGML_UNUSED(src1);
|
|
|
|
| 495 |
cgh.parallel_for(
|
| 496 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 497 |
[=](sycl::nd_item<3> item_ct1)
|
| 498 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 499 |
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
|
| 500 |
VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
|
| 501 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
|
|
| 519 |
cgh.parallel_for(
|
| 520 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 521 |
[=](sycl::nd_item<3> item_ct1)
|
| 522 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 523 |
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
|
| 524 |
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
|
| 525 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
|
|
| 543 |
cgh.parallel_for(
|
| 544 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 545 |
[=](sycl::nd_item<3> item_ct1)
|
| 546 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 547 |
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
|
| 548 |
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
|
| 549 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
|
|
| 567 |
cgh.parallel_for(
|
| 568 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 569 |
[=](sycl::nd_item<3> item_ct1)
|
| 570 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 571 |
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
|
| 572 |
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
|
| 573 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
|
|
| 591 |
cgh.parallel_for(
|
| 592 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 593 |
[=](sycl::nd_item<3> item_ct1)
|
| 594 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 595 |
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
|
| 596 |
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
|
| 597 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
|
|
| 615 |
cgh.parallel_for(
|
| 616 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 617 |
[=](sycl::nd_item<3> item_ct1)
|
| 618 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 619 |
mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
|
| 620 |
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
|
| 621 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
|
|
| 639 |
cgh.parallel_for(
|
| 640 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 641 |
[=](sycl::nd_item<3> item_ct1)
|
| 642 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 643 |
mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
|
| 644 |
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
|
| 645 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
|
|
| 663 |
cgh.parallel_for(
|
| 664 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 665 |
[=](sycl::nd_item<3> item_ct1)
|
| 666 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 667 |
mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
|
| 668 |
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
|
| 669 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
|
|
| 687 |
cgh.parallel_for(
|
| 688 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 689 |
[=](sycl::nd_item<3> item_ct1)
|
| 690 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 691 |
mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
|
| 692 |
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
|
| 693 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
|
|
| 711 |
cgh.parallel_for(
|
| 712 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 713 |
[=](sycl::nd_item<3> item_ct1)
|
| 714 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 715 |
mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
|
| 716 |
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
|
| 717 |
vx, vy, dst, ncols, nrows, item_ct1);
|
|
|
|
| 734 |
cgh.parallel_for(
|
| 735 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 736 |
[=](sycl::nd_item<3> item_ct1)
|
| 737 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 738 |
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
|
| 739 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 740 |
});
|
|
|
|
| 755 |
cgh.parallel_for(
|
| 756 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 757 |
[=](sycl::nd_item<3> item_ct1)
|
| 758 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 759 |
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
|
| 760 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 761 |
});
|
|
|
|
| 777 |
cgh.parallel_for(
|
| 778 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 779 |
[=](sycl::nd_item<3> item_ct1)
|
| 780 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 781 |
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
|
| 782 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 783 |
});
|
|
|
|
| 799 |
cgh.parallel_for(
|
| 800 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 801 |
[=](sycl::nd_item<3> item_ct1)
|
| 802 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 803 |
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
|
| 804 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 805 |
});
|
|
|
|
| 821 |
cgh.parallel_for(
|
| 822 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 823 |
[=](sycl::nd_item<3> item_ct1)
|
| 824 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 825 |
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
|
| 826 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 827 |
});
|
|
|
|
| 843 |
cgh.parallel_for(
|
| 844 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 845 |
[=](sycl::nd_item<3> item_ct1)
|
| 846 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 847 |
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
|
| 848 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 849 |
});
|
|
|
|
| 864 |
cgh.parallel_for(
|
| 865 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 866 |
[=](sycl::nd_item<3> item_ct1)
|
| 867 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 868 |
mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
|
| 869 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 870 |
});
|
|
|
|
| 886 |
cgh.parallel_for(
|
| 887 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 888 |
[=](sycl::nd_item<3> item_ct1)
|
| 889 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 890 |
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
|
| 891 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 892 |
});
|
|
|
|
| 908 |
cgh.parallel_for(
|
| 909 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 910 |
[=](sycl::nd_item<3> item_ct1)
|
| 911 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 912 |
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
|
| 913 |
vx, vy, dst, ncols, nrows, item_ct1);
|
| 914 |
});
|
|
|
|
| 1003 |
break;
|
| 1004 |
default:
|
| 1005 |
GGML_ABORT("fatal error");
|
|
|
|
| 1006 |
}
|
| 1007 |
}
|
| 1008 |
GGML_UNUSED(src1);
|
ggml/src/ggml-sycl/norm.cpp
CHANGED
|
@@ -235,7 +235,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
|
|
| 235 |
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
| 236 |
block_dims),
|
| 237 |
[=](sycl::nd_item<3> item_ct1)
|
| 238 |
-
[[
|
| 239 |
norm_f32(x, dst, ncols, eps, item_ct1,
|
| 240 |
nullptr, WARP_SIZE);
|
| 241 |
});
|
|
@@ -258,7 +258,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
|
|
| 258 |
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
| 259 |
block_dims),
|
| 260 |
[=](sycl::nd_item<3> item_ct1)
|
| 261 |
-
[[
|
| 262 |
norm_f32(x, dst, ncols, eps, item_ct1,
|
| 263 |
get_pointer(s_sum_acc_ct1), work_group_size);
|
| 264 |
});
|
|
@@ -277,7 +277,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
|
|
| 277 |
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
| 278 |
block_dims),
|
| 279 |
[=](sycl::nd_item<3> item_ct1)
|
| 280 |
-
[[
|
| 281 |
group_norm_f32(
|
| 282 |
x, dst, group_size, ne_elements, eps_ct4, item_ct1,
|
| 283 |
nullptr, WARP_SIZE);
|
|
@@ -304,7 +304,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
|
|
| 304 |
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
| 305 |
block_dims),
|
| 306 |
[=](sycl::nd_item<3> item_ct1)
|
| 307 |
-
[[
|
| 308 |
group_norm_f32(x, dst, group_size, ne_elements,
|
| 309 |
eps_ct4, item_ct1,
|
| 310 |
get_pointer(s_sum_acc_ct1), work_group_size);
|
|
@@ -325,7 +325,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
|
| 325 |
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
| 326 |
block_dims),
|
| 327 |
[=](sycl::nd_item<3> item_ct1)
|
| 328 |
-
[[
|
| 329 |
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
| 330 |
nullptr, WARP_SIZE);
|
| 331 |
});
|
|
@@ -347,7 +347,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
|
| 347 |
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
| 348 |
block_dims),
|
| 349 |
[=](sycl::nd_item<3> item_ct1)
|
| 350 |
-
[[
|
| 351 |
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
| 352 |
get_pointer(s_sum_acc_ct1), work_group_size);
|
| 353 |
});
|
|
|
|
| 235 |
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
| 236 |
block_dims),
|
| 237 |
[=](sycl::nd_item<3> item_ct1)
|
| 238 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 239 |
norm_f32(x, dst, ncols, eps, item_ct1,
|
| 240 |
nullptr, WARP_SIZE);
|
| 241 |
});
|
|
|
|
| 258 |
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
| 259 |
block_dims),
|
| 260 |
[=](sycl::nd_item<3> item_ct1)
|
| 261 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 262 |
norm_f32(x, dst, ncols, eps, item_ct1,
|
| 263 |
get_pointer(s_sum_acc_ct1), work_group_size);
|
| 264 |
});
|
|
|
|
| 277 |
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
| 278 |
block_dims),
|
| 279 |
[=](sycl::nd_item<3> item_ct1)
|
| 280 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 281 |
group_norm_f32(
|
| 282 |
x, dst, group_size, ne_elements, eps_ct4, item_ct1,
|
| 283 |
nullptr, WARP_SIZE);
|
|
|
|
| 304 |
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
| 305 |
block_dims),
|
| 306 |
[=](sycl::nd_item<3> item_ct1)
|
| 307 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 308 |
group_norm_f32(x, dst, group_size, ne_elements,
|
| 309 |
eps_ct4, item_ct1,
|
| 310 |
get_pointer(s_sum_acc_ct1), work_group_size);
|
|
|
|
| 325 |
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
| 326 |
block_dims),
|
| 327 |
[=](sycl::nd_item<3> item_ct1)
|
| 328 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 329 |
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
| 330 |
nullptr, WARP_SIZE);
|
| 331 |
});
|
|
|
|
| 347 |
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
| 348 |
block_dims),
|
| 349 |
[=](sycl::nd_item<3> item_ct1)
|
| 350 |
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 351 |
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
| 352 |
get_pointer(s_sum_acc_ct1), work_group_size);
|
| 353 |
});
|
ggml/src/ggml-sycl/softmax.cpp
CHANGED
|
@@ -132,7 +132,7 @@ static void soft_max_f32_submitter(const float * x, const T * mask, float * dst,
|
|
| 132 |
|
| 133 |
cgh.parallel_for(
|
| 134 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 135 |
-
[=](sycl::nd_item<3> item_ct1) [[
|
| 136 |
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
|
| 137 |
nrows_y, scale, max_bias, m0,
|
| 138 |
m1, n_head_log2, item_ct1,
|
|
|
|
| 132 |
|
| 133 |
cgh.parallel_for(
|
| 134 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 135 |
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 136 |
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
|
| 137 |
nrows_y, scale, max_bias, m0,
|
| 138 |
m1, n_head_log2, item_ct1,
|