hengyu committed
Commit acf23d9 · Parent: e2989d0

add concat through dim 1/2 (llama/8483)

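In short: the SYCL backend previously implemented GGML_OP_CONCAT only along dim 2 (the old kernel asserted `dim == 2`). This commit moves the op into ggml-sycl/concat.cpp and adds contiguous kernels for dims 0 and 1, a two-memcpy path for dim 3, and a strided fallback for non-contiguous inputs. A minimal host-side sketch of a graph that exercises the new paths, assuming the dim-parameterized ggml_concat() from llama.cpp PR 7563 (the PR cited in the TODO removed below):

// Hypothetical usage sketch, not part of this commit.
#include "ggml.h"

static ggml_tensor * build_concats(ggml_context * ctx) {
    // two F32 tensors that agree on every dimension
    ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 16, 8, 4); // ne = {16, 8, 4}
    ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 16, 8, 4);

    // `dim` is stored in dst->op_params[0]; ggml_sycl_op_concat reads it back
    ggml_tensor * c0 = ggml_concat(ctx, a,  b,  0); // ne = {32,  8, 4} -> concat_f32_dim0
    ggml_tensor * c1 = ggml_concat(ctx, c0, c0, 1); // ne = {32, 16, 4} -> concat_f32_dim1
    return ggml_concat(ctx, c1, c1, 2);             // ne = {32, 16, 8} -> concat_f32_dim2
}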
ggml/src/ggml-sycl.cpp CHANGED
@@ -291,29 +291,6 @@ static void sqr_f32(const float * x, float * dst, const int k,
         dst[i] = x[i] * x[i];
     }
 
-static void concat_f32(const float *x, const float *y, float *dst, const int ne0, const int ne02,
-                       const sycl::nd_item<3> &item_ct1) {
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
-        return;
-    }
-    // operation
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (item_ct1.get_group(0) < ne02) { // src0
-        int offset_src =
-            nidx + item_ct1.get_group(1) * ne0 +
-            item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-        dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            nidx + item_ct1.get_group(1) * ne0 +
-            (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
-        dst[offset_dst] = y[offset_src];
-    }
-}
-
 static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
                         const int nb02, const int nb03, const int ne10, const int ne11,
                         const int ne12, const int ne13, const float sf0, const float sf1,
@@ -1347,20 +1324,6 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-static void concat_f32_sycl(const float *x, const float *y, float *dst,
-                            const int ne0, int ne1, int ne2, int ne02,
-                            queue_ptr stream) {
-    int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne2, ne1, num_blocks);
-    stream->parallel_for(
-        sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            concat_f32(x, y, dst, ne0, ne02, item_ct1);
-        });
-}
-
 static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
                              const int nb02, const int nb03, const int ne10, const int ne11,
                              const int ne12, const int ne13, const float sf0, const float sf1,
@@ -2429,28 +2392,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
     (void) src1_dd;
 }
 
-inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_dd, const float *src1_dd,
-                                float *dst_dd,
-                                const queue_ptr &main_stream) {
-#pragma message("TODO: generalize concat kernel for dim != 2")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7563")
-    int dim = dst->op_params[0];
-    GGML_ASSERT(dim == 2);
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
-
-    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-        concat_f32_sycl(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
-    }
-
-    (void) src1;
-    (void) dst;
-}
-
 inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                  const ggml_tensor *src1, ggml_tensor *dst,
                                  const float *src0_dd, const float *src1_dd,
@@ -3359,12 +3300,6 @@ static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, const ggml_ten
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
-static void ggml_sycl_concat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_concat);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
-}
-
 static void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale);
@@ -4101,7 +4036,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
             func = ggml_sycl_group_norm;
             break;
         case GGML_OP_CONCAT:
-            func = ggml_sycl_concat;
+            func = ggml_sycl_op_concat;
             break;
         case GGML_OP_UPSCALE:
             func = ggml_sycl_upscale;
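Context for the removals above: the old ggml_sycl_concat wrapper went through ggml_sycl_op_flatten, which hands the op pre-flattened, contiguous f32 device pointers, so the kernel could only ever see flat buffers and a single concat dimension. The replacement ggml_sycl_op_concat (added in concat.cpp below) takes the tensors directly and picks between the contiguous per-dim kernels and a strided fallback, which is why the dispatch table now points at it.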
ggml/src/ggml-sycl/backend.hpp CHANGED
@@ -13,6 +13,7 @@
 #ifndef GGML_SYCL_BACKEND_HPP
 #define GGML_SYCL_BACKEND_HPP
 
+#include "concat.hpp"
 #include "common.hpp"
 #include "convert.hpp"
 #include "dequantize.hpp"
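backend.hpp aggregates the per-op headers for the SYCL backend; listing concat.hpp here is what exposes the new ggml_sycl_op_concat declaration to the dispatch code in ggml-sycl.cpp.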
ggml/src/ggml-sycl/concat.cpp ADDED
@@ -0,0 +1,195 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "concat.hpp"
+#include "common.hpp"
+
+static void concat_f32_dim0(const float *x, const float *y, float *dst,
+                            const int ne0, const int ne00,
+                            const sycl::nd_item<3> &item_ct1) {
+  int nidx = item_ct1.get_local_id(2) +
+             item_ct1.get_group(2) * item_ct1.get_local_range(2);
+  if (nidx >= ne0) {
+    return;
+  }
+  // operation
+  int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                   item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+  if (nidx < ne00) { // src0
+    int offset_src = nidx + item_ct1.get_group(1) * ne00 +
+                     item_ct1.get_group(0) * ne00 * item_ct1.get_group_range(1);
+    dst[offset_dst] = x[offset_src];
+  } else {
+    int offset_src =
+        nidx - ne00 + item_ct1.get_group(1) * (ne0 - ne00) +
+        item_ct1.get_group(0) * (ne0 - ne00) * item_ct1.get_group_range(1);
+    dst[offset_dst] = y[offset_src];
+  }
+}
+
+static void concat_f32_dim1(const float *x, const float *y, float *dst,
+                            const int ne0, const int ne01,
+                            const sycl::nd_item<3> &item_ct1) {
+  int nidx = item_ct1.get_local_id(2) +
+             item_ct1.get_group(2) * item_ct1.get_local_range(2);
+  if (nidx >= ne0) {
+    return;
+  }
+  // operation
+  int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                   item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+  if (item_ct1.get_group(1) < ne01) { // src0
+    int offset_src =
+        nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01;
+    dst[offset_dst] = x[offset_src];
+  } else {
+    int offset_src =
+        nidx + (item_ct1.get_group(1) - ne01) * ne0 +
+        item_ct1.get_group(0) * ne0 * (item_ct1.get_group_range(1) - ne01);
+    dst[offset_dst] = y[offset_src];
+  }
+}
+
+static void concat_f32_dim2(const float *x, const float *y, float *dst,
+                            const int ne0, const int ne02,
+                            const sycl::nd_item<3> &item_ct1) {
+  int nidx = item_ct1.get_local_id(2) +
+             item_ct1.get_group(2) * item_ct1.get_local_range(2);
+  if (nidx >= ne0) {
+    return;
+  }
+  // operation
+  int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                   item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+  if (item_ct1.get_group(0) < ne02) { // src0
+    int offset_src = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    dst[offset_dst] = x[offset_src];
+  } else {
+    int offset_src =
+        nidx + item_ct1.get_group(1) * ne0 +
+        (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
+    dst[offset_dst] = y[offset_src];
+  }
+}
+
+static void concat_f32_sycl(const float *x, const float *y, float *dst,
+                            int ne00, int ne01, int ne02, int ne0, int ne1,
+                            int ne2, int dim, queue_ptr stream) {
+  int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
+  sycl::range<3> gridDim(ne2, ne1, num_blocks);
+  switch (dim) {
+  case 0:
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+          concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
+        });
+    break;
+  case 1:
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+          concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
+        });
+    break;
+  default:
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+          concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
+        });
+    break;
+  }
+}
+
+// non-contiguous kernel (slow)
+static void concat_f32_sycl_non_cont(
+    queue_ptr stream, const char *src0, const char *src1, char *dst,
+    int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00,
+    uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/,
+    int64_t /*ne11*/, int64_t /*ne12*/, int64_t /*ne13*/, uint64_t nb10,
+    uint64_t nb11, uint64_t nb12, uint64_t nb13, int64_t ne0, int64_t ne1,
+    int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2,
+    uint64_t nb3, int32_t dim) {
+  sycl::range<3> gridDim(ne3, ne2, ne1);
+  stream->parallel_for(
+      sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)),
+      [=](sycl::nd_item<3> item_ct1) {
+        int64_t i3 = item_ct1.get_group(0);
+        int64_t i2 = item_ct1.get_group(1);
+        int64_t i1 = item_ct1.get_group(2);
+
+        int64_t o[4] = {0, 0, 0, 0};
+        o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
+
+        const float *x;
+
+        for (int i0 = item_ct1.get_local_id(2); i0 < ne0;
+             i0 += item_ct1.get_local_range(2)) {
+          if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+            x = (const float *)(src0 + (i3)*nb03 + (i2)*nb02 + (i1)*nb01 +
+                                (i0)*nb00);
+          } else {
+            x = (const float *)(src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 +
+                                (i1 - o[1]) * nb11 + (i0 - o[0]) * nb10);
+          }
+
+          float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
+
+          *y = *x;
+        }
+      });
+}
+
+void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                         const ggml_tensor *src1, ggml_tensor *dst) {
+  queue_ptr stream = ctx.stream();
+
+  const int32_t dim = ((int32_t *)dst->op_params)[0];
+
+  if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+    const float *src0_d = (const float *)src0->data;
+    const float *src1_d = (const float *)src1->data;
+
+    float *dst_d = (float *)dst->data;
+
+    if (dim != 3) {
+      for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+        concat_f32_sycl(
+            src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
+            dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1],
+            src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
+      }
+    } else {
+      const size_t size0 = ggml_nbytes(src0);
+      const size_t size1 = ggml_nbytes(src1);
+
+      SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
+      SYCL_CHECK(CHECK_TRY_ERROR(
+          stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
+    }
+  } else
+    concat_f32_sycl_non_cont(
+        stream, (const char *)src0->data, (const char *)src1->data,
+        (char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
+        src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
+        src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
+        dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
+}
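For reference while reviewing the kernels above, here is a hypothetical host-side check (not part of the commit) that computes the same result for dense f32 tensors in ggml memory order (dim 0 fastest). The `s[dim] -= ne_x[dim]` shift mirrors the adjustment each concat_f32_dim* kernel applies when an output element falls in the src1 half:

// Hypothetical CPU reference for concat of dense f32 tensors; the names
// concat_ref, ne_x, and ne_y are illustration only. Inputs must match on
// every dimension except `dim`.
#include <cstdint>
#include <vector>

static std::vector<float> concat_ref(const std::vector<float> &x,
                                     const std::vector<float> &y,
                                     const int64_t ne_x[4],
                                     const int64_t ne_y[4], int dim) {
    int64_t ne[4];
    for (int d = 0; d < 4; ++d) {
        ne[d] = d == dim ? ne_x[d] + ne_y[d] : ne_x[d]; // other dims equal ne_y[d]
    }
    std::vector<float> dst((size_t)(ne[0] * ne[1] * ne[2] * ne[3]));
    for (int64_t i3 = 0; i3 < ne[3]; ++i3)
    for (int64_t i2 = 0; i2 < ne[2]; ++i2)
    for (int64_t i1 = 0; i1 < ne[1]; ++i1)
    for (int64_t i0 = 0; i0 < ne[0]; ++i0) {
        int64_t s[4] = {i0, i1, i2, i3};        // source coordinates
        const bool from_x = s[dim] < ne_x[dim]; // first half comes from src0
        if (!from_x) {
            s[dim] -= ne_x[dim];                // shift into src1's index space
        }
        const int64_t *ns = from_x ? ne_x : ne_y;
        const std::vector<float> &src = from_x ? x : y;
        dst[((i3 * ne[2] + i2) * ne[1] + i1) * ne[0] + i0] =
            src[((s[3] * ns[2] + s[2]) * ns[1] + s[1]) * ns[0] + s[0]];
    }
    return dst;
}

For dim == 3 with contiguous inputs this formula degenerates to src0's buffer followed by src1's, which is why ggml_sycl_op_concat can service that case with two memcpys.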
ggml/src/ggml-sycl/concat.hpp ADDED
@@ -0,0 +1,21 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_CONCAT_HPP
+#define GGML_SYCL_CONCAT_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                         const ggml_tensor *src1, ggml_tensor *dst);
+
+#endif // GGML_SYCL_CONCAT_HPP