Spaces:
Running
Running
add concat through dim 1/2 (llama/8483)
Browse files- ggml/src/ggml-sycl.cpp +1 -66
- ggml/src/ggml-sycl/backend.hpp +1 -0
- ggml/src/ggml-sycl/concat.cpp +195 -0
- ggml/src/ggml-sycl/concat.hpp +21 -0
ggml/src/ggml-sycl.cpp
CHANGED
|
@@ -291,29 +291,6 @@ static void sqr_f32(const float * x, float * dst, const int k,
|
|
| 291 |
dst[i] = x[i] * x[i];
|
| 292 |
}
|
| 293 |
|
| 294 |
-
static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02,
|
| 295 |
-
const sycl::nd_item<3> &item_ct1) {
|
| 296 |
-
int nidx = item_ct1.get_local_id(2) +
|
| 297 |
-
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
| 298 |
-
if (nidx >= ne0) {
|
| 299 |
-
return;
|
| 300 |
-
}
|
| 301 |
-
// operation
|
| 302 |
-
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
|
| 303 |
-
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
| 304 |
-
if (item_ct1.get_group(0) < ne02) { // src0
|
| 305 |
-
int offset_src =
|
| 306 |
-
nidx + item_ct1.get_group(1) * ne0 +
|
| 307 |
-
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
| 308 |
-
dst[offset_dst] = x[offset_src];
|
| 309 |
-
} else {
|
| 310 |
-
int offset_src =
|
| 311 |
-
nidx + item_ct1.get_group(1) * ne0 +
|
| 312 |
-
(item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
|
| 313 |
-
dst[offset_dst] = y[offset_src];
|
| 314 |
-
}
|
| 315 |
-
}
|
| 316 |
-
|
| 317 |
static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
| 318 |
const int nb02, const int nb03, const int ne10, const int ne11,
|
| 319 |
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
@@ -1347,20 +1324,6 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
|
|
| 1347 |
});
|
| 1348 |
}
|
| 1349 |
|
| 1350 |
-
static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
| 1351 |
-
const int ne0, int ne1, int ne2, int ne02,
|
| 1352 |
-
queue_ptr stream) {
|
| 1353 |
-
int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
|
| 1354 |
-
sycl::range<3> gridDim(ne2, ne1, num_blocks);
|
| 1355 |
-
stream->parallel_for(
|
| 1356 |
-
sycl::nd_range<3>(gridDim *
|
| 1357 |
-
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
| 1358 |
-
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
| 1359 |
-
[=](sycl::nd_item<3> item_ct1) {
|
| 1360 |
-
concat_f32(x, y, dst, ne0, ne02, item_ct1);
|
| 1361 |
-
});
|
| 1362 |
-
}
|
| 1363 |
-
|
| 1364 |
static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
|
| 1365 |
const int nb02, const int nb03, const int ne10, const int ne11,
|
| 1366 |
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
@@ -2429,28 +2392,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
|
|
| 2429 |
(void) src1_dd;
|
| 2430 |
}
|
| 2431 |
|
| 2432 |
-
inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
| 2433 |
-
const ggml_tensor *src1, ggml_tensor *dst,
|
| 2434 |
-
const float *src0_dd, const float *src1_dd,
|
| 2435 |
-
float *dst_dd,
|
| 2436 |
-
const queue_ptr &main_stream) {
|
| 2437 |
-
#pragma message("TODO: generalize concat kernel for dim != 2")
|
| 2438 |
-
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7563")
|
| 2439 |
-
int dim = dst->op_params[0];
|
| 2440 |
-
GGML_ASSERT(dim == 2);
|
| 2441 |
-
|
| 2442 |
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 2443 |
-
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
| 2444 |
-
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
| 2445 |
-
|
| 2446 |
-
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
| 2447 |
-
concat_f32_sycl(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
|
| 2448 |
-
}
|
| 2449 |
-
|
| 2450 |
-
(void) src1;
|
| 2451 |
-
(void) dst;
|
| 2452 |
-
}
|
| 2453 |
-
|
| 2454 |
inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
| 2455 |
const ggml_tensor *src1, ggml_tensor *dst,
|
| 2456 |
const float *src0_dd, const float *src1_dd,
|
|
@@ -3359,12 +3300,6 @@ static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, const ggml_ten
|
|
| 3359 |
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
| 3360 |
}
|
| 3361 |
|
| 3362 |
-
static void ggml_sycl_concat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 3363 |
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
| 3364 |
-
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_concat);
|
| 3365 |
-
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
| 3366 |
-
}
|
| 3367 |
-
|
| 3368 |
static void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 3369 |
GGML_SYCL_DEBUG("call %s\n", __func__);
|
| 3370 |
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale);
|
|
@@ -4101,7 +4036,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
|
|
| 4101 |
func = ggml_sycl_group_norm;
|
| 4102 |
break;
|
| 4103 |
case GGML_OP_CONCAT:
|
| 4104 |
-
func =
|
| 4105 |
break;
|
| 4106 |
case GGML_OP_UPSCALE:
|
| 4107 |
func = ggml_sycl_upscale;
|
|
|
|
| 291 |
dst[i] = x[i] * x[i];
|
| 292 |
}
|
| 293 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
| 295 |
const int nb02, const int nb03, const int ne10, const int ne11,
|
| 296 |
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
|
|
| 1324 |
});
|
| 1325 |
}
|
| 1326 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1327 |
static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
|
| 1328 |
const int nb02, const int nb03, const int ne10, const int ne11,
|
| 1329 |
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
|
|
| 2392 |
(void) src1_dd;
|
| 2393 |
}
|
| 2394 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2395 |
inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
| 2396 |
const ggml_tensor *src1, ggml_tensor *dst,
|
| 2397 |
const float *src0_dd, const float *src1_dd,
|
|
|
|
| 3300 |
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
| 3301 |
}
|
| 3302 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3303 |
static void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 3304 |
GGML_SYCL_DEBUG("call %s\n", __func__);
|
| 3305 |
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale);
|
|
|
|
| 4036 |
func = ggml_sycl_group_norm;
|
| 4037 |
break;
|
| 4038 |
case GGML_OP_CONCAT:
|
| 4039 |
+
func = ggml_sycl_op_concat;
|
| 4040 |
break;
|
| 4041 |
case GGML_OP_UPSCALE:
|
| 4042 |
func = ggml_sycl_upscale;
|
ggml/src/ggml-sycl/backend.hpp
CHANGED
|
@@ -13,6 +13,7 @@
|
|
| 13 |
#ifndef GGML_SYCL_BACKEND_HPP
|
| 14 |
#define GGML_SYCL_BACKEND_HPP
|
| 15 |
|
|
|
|
| 16 |
#include "common.hpp"
|
| 17 |
#include "convert.hpp"
|
| 18 |
#include "dequantize.hpp"
|
|
|
|
| 13 |
#ifndef GGML_SYCL_BACKEND_HPP
|
| 14 |
#define GGML_SYCL_BACKEND_HPP
|
| 15 |
|
| 16 |
+
#include "concat.hpp"
|
| 17 |
#include "common.hpp"
|
| 18 |
#include "convert.hpp"
|
| 19 |
#include "dequantize.hpp"
|
ggml/src/ggml-sycl/concat.cpp
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// MIT license
|
| 3 |
+
// Copyright (C) 2024 Intel Corporation
|
| 4 |
+
// SPDX-License-Identifier: MIT
|
| 5 |
+
//
|
| 6 |
+
|
| 7 |
+
//
|
| 8 |
+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
| 9 |
+
// See https://llvm.org/LICENSE.txt for license information.
|
| 10 |
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
| 11 |
+
//
|
| 12 |
+
|
| 13 |
+
#include "concat.hpp"
|
| 14 |
+
#include "common.hpp"
|
| 15 |
+
|
| 16 |
+
static void concat_f32_dim0(const float *x, const float *y, float *dst,
|
| 17 |
+
const int ne0, const int ne00,
|
| 18 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 19 |
+
int nidx = item_ct1.get_local_id(2) +
|
| 20 |
+
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
| 21 |
+
if (nidx >= ne0) {
|
| 22 |
+
return;
|
| 23 |
+
}
|
| 24 |
+
// operation
|
| 25 |
+
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
|
| 26 |
+
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
| 27 |
+
if (nidx < ne00) { // src0
|
| 28 |
+
int offset_src = nidx + item_ct1.get_group(1) * ne00 +
|
| 29 |
+
item_ct1.get_group(0) * ne00 * item_ct1.get_group_range(1);
|
| 30 |
+
dst[offset_dst] = x[offset_src];
|
| 31 |
+
} else {
|
| 32 |
+
int offset_src =
|
| 33 |
+
nidx - ne00 + item_ct1.get_group(1) * (ne0 - ne00) +
|
| 34 |
+
item_ct1.get_group(0) * (ne0 - ne00) * item_ct1.get_group_range(1);
|
| 35 |
+
dst[offset_dst] = y[offset_src];
|
| 36 |
+
}
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
static void concat_f32_dim1(const float *x, const float *y, float *dst,
|
| 40 |
+
const int ne0, const int ne01,
|
| 41 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 42 |
+
int nidx = item_ct1.get_local_id(2) +
|
| 43 |
+
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
| 44 |
+
if (nidx >= ne0) {
|
| 45 |
+
return;
|
| 46 |
+
}
|
| 47 |
+
// operation
|
| 48 |
+
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
|
| 49 |
+
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
| 50 |
+
if (item_ct1.get_group(1) < ne01) { // src0
|
| 51 |
+
int offset_src =
|
| 52 |
+
nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01;
|
| 53 |
+
dst[offset_dst] = x[offset_src];
|
| 54 |
+
} else {
|
| 55 |
+
int offset_src =
|
| 56 |
+
nidx + (item_ct1.get_group(1) - ne01) * ne0 +
|
| 57 |
+
item_ct1.get_group(0) * ne0 * (item_ct1.get_group_range(1) - ne01);
|
| 58 |
+
dst[offset_dst] = y[offset_src];
|
| 59 |
+
}
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
static void concat_f32_dim2(const float *x, const float *y, float *dst,
|
| 63 |
+
const int ne0, const int ne02,
|
| 64 |
+
const sycl::nd_item<3> &item_ct1) {
|
| 65 |
+
int nidx = item_ct1.get_local_id(2) +
|
| 66 |
+
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
| 67 |
+
if (nidx >= ne0) {
|
| 68 |
+
return;
|
| 69 |
+
}
|
| 70 |
+
// operation
|
| 71 |
+
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
|
| 72 |
+
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
| 73 |
+
if (item_ct1.get_group(0) < ne02) { // src0
|
| 74 |
+
int offset_src = nidx + item_ct1.get_group(1) * ne0 +
|
| 75 |
+
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
| 76 |
+
dst[offset_dst] = x[offset_src];
|
| 77 |
+
} else {
|
| 78 |
+
int offset_src =
|
| 79 |
+
nidx + item_ct1.get_group(1) * ne0 +
|
| 80 |
+
(item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
|
| 81 |
+
dst[offset_dst] = y[offset_src];
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
| 86 |
+
int ne00, int ne01, int ne02, int ne0, int ne1,
|
| 87 |
+
int ne2, int dim, queue_ptr stream) {
|
| 88 |
+
int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
|
| 89 |
+
sycl::range<3> gridDim(ne2, ne1, num_blocks);
|
| 90 |
+
switch (dim) {
|
| 91 |
+
case 0:
|
| 92 |
+
stream->parallel_for(
|
| 93 |
+
sycl::nd_range<3>(gridDim *
|
| 94 |
+
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
| 95 |
+
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
| 96 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 97 |
+
concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
|
| 98 |
+
});
|
| 99 |
+
break;
|
| 100 |
+
case 1:
|
| 101 |
+
stream->parallel_for(
|
| 102 |
+
sycl::nd_range<3>(gridDim *
|
| 103 |
+
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
| 104 |
+
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
| 105 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 106 |
+
concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
|
| 107 |
+
});
|
| 108 |
+
break;
|
| 109 |
+
default:
|
| 110 |
+
stream->parallel_for(
|
| 111 |
+
sycl::nd_range<3>(gridDim *
|
| 112 |
+
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
| 113 |
+
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
| 114 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 115 |
+
concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
|
| 116 |
+
});
|
| 117 |
+
break;
|
| 118 |
+
}
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
// non-contiguous kernel (slow)
|
| 122 |
+
static void concat_f32_sycl_non_cont(
|
| 123 |
+
queue_ptr stream, const char *src0, const char *src1, char *dst,
|
| 124 |
+
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00,
|
| 125 |
+
uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/,
|
| 126 |
+
int64_t /*ne11*/, int64_t /*ne12*/, int64_t /*ne13*/, uint64_t nb10,
|
| 127 |
+
uint64_t nb11, uint64_t nb12, uint64_t nb13, int64_t ne0, int64_t ne1,
|
| 128 |
+
int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2,
|
| 129 |
+
uint64_t nb3, int32_t dim) {
|
| 130 |
+
sycl::range<3> gridDim(ne3, ne2, ne1);
|
| 131 |
+
stream->parallel_for(
|
| 132 |
+
sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)),
|
| 133 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 134 |
+
int64_t i3 = item_ct1.get_group(0);
|
| 135 |
+
int64_t i2 = item_ct1.get_group(1);
|
| 136 |
+
int64_t i1 = item_ct1.get_group(2);
|
| 137 |
+
|
| 138 |
+
int64_t o[4] = {0, 0, 0, 0};
|
| 139 |
+
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
|
| 140 |
+
|
| 141 |
+
const float *x;
|
| 142 |
+
|
| 143 |
+
for (int i0 = item_ct1.get_local_id(2); i0 < ne0;
|
| 144 |
+
i0 += item_ct1.get_local_range(2)) {
|
| 145 |
+
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
| 146 |
+
x = (const float *)(src0 + (i3)*nb03 + (i2)*nb02 + (i1)*nb01 +
|
| 147 |
+
(i0)*nb00);
|
| 148 |
+
} else {
|
| 149 |
+
x = (const float *)(src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 +
|
| 150 |
+
(i1 - o[1]) * nb11 + (i0 - o[0]) * nb10);
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
|
| 154 |
+
|
| 155 |
+
*y = *x;
|
| 156 |
+
}
|
| 157 |
+
});
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
| 161 |
+
const ggml_tensor *src1, ggml_tensor *dst) {
|
| 162 |
+
queue_ptr stream = ctx.stream();
|
| 163 |
+
|
| 164 |
+
const int32_t dim = ((int32_t *)dst->op_params)[0];
|
| 165 |
+
|
| 166 |
+
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
| 167 |
+
const float *src0_d = (const float *)src0->data;
|
| 168 |
+
const float *src1_d = (const float *)src1->data;
|
| 169 |
+
|
| 170 |
+
float *dst_d = (float *)dst->data;
|
| 171 |
+
|
| 172 |
+
if (dim != 3) {
|
| 173 |
+
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
| 174 |
+
concat_f32_sycl(
|
| 175 |
+
src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
|
| 176 |
+
dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1],
|
| 177 |
+
src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
|
| 178 |
+
}
|
| 179 |
+
} else {
|
| 180 |
+
const size_t size0 = ggml_nbytes(src0);
|
| 181 |
+
const size_t size1 = ggml_nbytes(src1);
|
| 182 |
+
|
| 183 |
+
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
|
| 184 |
+
SYCL_CHECK(CHECK_TRY_ERROR(
|
| 185 |
+
stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
|
| 186 |
+
}
|
| 187 |
+
} else
|
| 188 |
+
concat_f32_sycl_non_cont(
|
| 189 |
+
stream, (const char *)src0->data, (const char *)src1->data,
|
| 190 |
+
(char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
| 191 |
+
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
|
| 192 |
+
src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
|
| 193 |
+
src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
|
| 194 |
+
dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
|
| 195 |
+
}
|
ggml/src/ggml-sycl/concat.hpp
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// MIT license
|
| 3 |
+
// Copyright (C) 2024 Intel Corporation
|
| 4 |
+
// SPDX-License-Identifier: MIT
|
| 5 |
+
//
|
| 6 |
+
|
| 7 |
+
//
|
| 8 |
+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
| 9 |
+
// See https://llvm.org/LICENSE.txt for license information.
|
| 10 |
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
| 11 |
+
//
|
| 12 |
+
|
| 13 |
+
#ifndef GGML_SYCL_CONCAT_HPP
|
| 14 |
+
#define GGML_SYCL_CONCAT_HPP
|
| 15 |
+
|
| 16 |
+
#include "common.hpp"
|
| 17 |
+
|
| 18 |
+
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
| 19 |
+
const ggml_tensor *src1, ggml_tensor *dst);
|
| 20 |
+
|
| 21 |
+
#endif // GGML_SYCL_CONCAT_HPP
|