Spaces:
Sleeping
Sleeping
Fix the sub group size of Intel (llama/8106)
Browse files* use warp_size macro for all sycl kernels
* fix mask of permute_sub_group_by_xor
* fix rms_norm with correct warp number
* fix rms_norm_f32/group_norm_f32
* move norm to norm.cpp file
* fix quantize bug
* fix mmvq's batch size
- ggml/src/CMakeLists.txt +3 -1
- ggml/src/ggml-sycl.cpp +42 -430
ggml/src/CMakeLists.txt
CHANGED
|
@@ -486,9 +486,11 @@ if (GGML_SYCL)
|
|
| 486 |
add_compile_options(-I./) #include DPCT
|
| 487 |
|
| 488 |
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
|
| 489 |
-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
|
| 490 |
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
| 491 |
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
|
|
|
|
|
|
|
|
|
| 492 |
endif()
|
| 493 |
|
| 494 |
file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
|
|
|
|
| 486 |
add_compile_options(-I./) #include DPCT
|
| 487 |
|
| 488 |
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
|
|
|
|
| 489 |
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
| 490 |
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
| 491 |
+
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
| 492 |
+
else()
|
| 493 |
+
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
|
| 494 |
endif()
|
| 495 |
|
| 496 |
file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
|
ggml/src/ggml-sycl.cpp
CHANGED
|
@@ -74,51 +74,6 @@ typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const gg
|
|
| 74 |
const float *src1_dd, float *dst_dd,
|
| 75 |
const queue_ptr &main_stream);
|
| 76 |
|
| 77 |
-
static __dpct_inline__ float warp_reduce_sum(float x,
|
| 78 |
-
const sycl::nd_item<3> &item_ct1) {
|
| 79 |
-
#pragma unroll
|
| 80 |
-
for (int mask = 16; mask > 0; mask >>= 1) {
|
| 81 |
-
/*
|
| 82 |
-
DPCT1096:98: The right-most dimension of the work-group used in the SYCL
|
| 83 |
-
kernel that calls this function may be less than "32". The function
|
| 84 |
-
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
|
| 85 |
-
CPU device. Modify the size of the work-group to ensure that the value
|
| 86 |
-
of the right-most dimension is a multiple of "32".
|
| 87 |
-
*/
|
| 88 |
-
x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
|
| 89 |
-
}
|
| 90 |
-
return x;
|
| 91 |
-
}
|
| 92 |
-
|
| 93 |
-
static __dpct_inline__ sycl::float2
|
| 94 |
-
warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) {
|
| 95 |
-
#pragma unroll
|
| 96 |
-
for (int mask = 16; mask > 0; mask >>= 1) {
|
| 97 |
-
a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
|
| 98 |
-
mask);
|
| 99 |
-
a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
|
| 100 |
-
mask);
|
| 101 |
-
}
|
| 102 |
-
return a;
|
| 103 |
-
}
|
| 104 |
-
|
| 105 |
-
static __dpct_inline__ float warp_reduce_max(float x,
|
| 106 |
-
const sycl::nd_item<3> &item_ct1) {
|
| 107 |
-
#pragma unroll
|
| 108 |
-
for (int mask = 16; mask > 0; mask >>= 1) {
|
| 109 |
-
/*
|
| 110 |
-
DPCT1096:97: The right-most dimension of the work-group used in the SYCL
|
| 111 |
-
kernel that calls this function may be less than "32". The function
|
| 112 |
-
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
|
| 113 |
-
CPU device. Modify the size of the work-group to ensure that the value
|
| 114 |
-
of the right-most dimension is a multiple of "32".
|
| 115 |
-
*/
|
| 116 |
-
x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
|
| 117 |
-
item_ct1.get_sub_group(), x, mask));
|
| 118 |
-
}
|
| 119 |
-
return x;
|
| 120 |
-
}
|
| 121 |
-
|
| 122 |
static __dpct_inline__ float op_repeat(const float a, const float b) {
|
| 123 |
return b;
|
| 124 |
GGML_UNUSED(a);
|
|
@@ -336,47 +291,6 @@ static void sqr_f32(const float * x, float * dst, const int k,
|
|
| 336 |
dst[i] = x[i] * x[i];
|
| 337 |
}
|
| 338 |
|
| 339 |
-
static void norm_f32(const float * x, float * dst, const int ncols, const float eps,
|
| 340 |
-
const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum, int block_size) {
|
| 341 |
-
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 342 |
-
item_ct1.get_local_id(1);
|
| 343 |
-
const int tid = item_ct1.get_local_id(2);
|
| 344 |
-
|
| 345 |
-
sycl::float2 mean_var = sycl::float2(0.f, 0.f);
|
| 346 |
-
|
| 347 |
-
for (int col = tid; col < ncols; col += block_size) {
|
| 348 |
-
const float xi = x[row*ncols + col];
|
| 349 |
-
mean_var.x() += xi;
|
| 350 |
-
mean_var.y() += xi * xi;
|
| 351 |
-
}
|
| 352 |
-
|
| 353 |
-
// sum up partial sums
|
| 354 |
-
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
| 355 |
-
if (block_size > WARP_SIZE) {
|
| 356 |
-
|
| 357 |
-
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
| 358 |
-
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
| 359 |
-
if (lane_id == 0) {
|
| 360 |
-
s_sum[warp_id] = mean_var;
|
| 361 |
-
}
|
| 362 |
-
/*
|
| 363 |
-
DPCT1118:0: SYCL group functions and algorithms must be encountered in
|
| 364 |
-
converged control flow. You may need to adjust the code.
|
| 365 |
-
*/
|
| 366 |
-
item_ct1.barrier(sycl::access::fence_space::local_space);
|
| 367 |
-
mean_var = s_sum[lane_id];
|
| 368 |
-
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
| 369 |
-
}
|
| 370 |
-
|
| 371 |
-
const float mean = mean_var.x() / ncols;
|
| 372 |
-
const float var = mean_var.y() / ncols - mean * mean;
|
| 373 |
-
const float inv_std = sycl::rsqrt(var + eps);
|
| 374 |
-
|
| 375 |
-
for (int col = tid; col < ncols; col += block_size) {
|
| 376 |
-
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
|
| 377 |
-
}
|
| 378 |
-
}
|
| 379 |
-
|
| 380 |
static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02,
|
| 381 |
const sycl::nd_item<3> &item_ct1) {
|
| 382 |
int nidx = item_ct1.get_local_id(2) +
|
|
@@ -444,126 +358,11 @@ static void pad_f32(const float *x, float *dst, const int ne0, const int ne00,
|
|
| 444 |
}
|
| 445 |
}
|
| 446 |
|
| 447 |
-
|
| 448 |
-
const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
|
| 449 |
-
int start = item_ct1.get_group(2) * group_size;
|
| 450 |
-
int end = start + group_size;
|
| 451 |
-
|
| 452 |
-
start += item_ct1.get_local_id(2);
|
| 453 |
-
|
| 454 |
-
if (end >= ne_elements) {
|
| 455 |
-
end = ne_elements;
|
| 456 |
-
}
|
| 457 |
-
|
| 458 |
-
float tmp = 0.0f; // partial sum for thread in warp
|
| 459 |
-
|
| 460 |
-
for (int j = start; j < end; j += block_size) {
|
| 461 |
-
tmp += x[j];
|
| 462 |
-
}
|
| 463 |
-
|
| 464 |
-
tmp = warp_reduce_sum(tmp, item_ct1);
|
| 465 |
-
if (block_size > WARP_SIZE) {
|
| 466 |
-
|
| 467 |
-
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
| 468 |
-
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
| 469 |
-
if (lane_id == 0) {
|
| 470 |
-
s_sum[warp_id] = tmp;
|
| 471 |
-
}
|
| 472 |
-
/*
|
| 473 |
-
DPCT1118:1: SYCL group functions and algorithms must be encountered in
|
| 474 |
-
converged control flow. You may need to adjust the code.
|
| 475 |
-
*/
|
| 476 |
-
/*
|
| 477 |
-
DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
|
| 478 |
-
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
| 479 |
-
better performance if there is no access to global memory.
|
| 480 |
-
*/
|
| 481 |
-
item_ct1.barrier();
|
| 482 |
-
tmp = s_sum[lane_id];
|
| 483 |
-
tmp = warp_reduce_sum(tmp, item_ct1);
|
| 484 |
-
}
|
| 485 |
-
|
| 486 |
-
float mean = tmp / group_size;
|
| 487 |
-
tmp = 0.0f;
|
| 488 |
-
|
| 489 |
-
for (int j = start; j < end; j += block_size) {
|
| 490 |
-
float xi = x[j] - mean;
|
| 491 |
-
dst[j] = xi;
|
| 492 |
-
tmp += xi * xi;
|
| 493 |
-
}
|
| 494 |
-
|
| 495 |
-
tmp = warp_reduce_sum(tmp, item_ct1);
|
| 496 |
-
if (block_size > WARP_SIZE) {
|
| 497 |
-
|
| 498 |
-
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
| 499 |
-
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
| 500 |
-
if (lane_id == 0) {
|
| 501 |
-
s_sum[warp_id] = tmp;
|
| 502 |
-
}
|
| 503 |
-
/*
|
| 504 |
-
DPCT1118:2: SYCL group functions and algorithms must be encountered in
|
| 505 |
-
converged control flow. You may need to adjust the code.
|
| 506 |
-
*/
|
| 507 |
-
/*
|
| 508 |
-
DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
|
| 509 |
-
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
| 510 |
-
better performance if there is no access to global memory.
|
| 511 |
-
*/
|
| 512 |
-
item_ct1.barrier();
|
| 513 |
-
tmp = s_sum[lane_id];
|
| 514 |
-
tmp = warp_reduce_sum(tmp, item_ct1);
|
| 515 |
-
}
|
| 516 |
-
|
| 517 |
-
float variance = tmp / group_size;
|
| 518 |
-
float scale = sycl::rsqrt(variance + eps);
|
| 519 |
-
for (int j = start; j < end; j += block_size) {
|
| 520 |
-
dst[j] *= scale;
|
| 521 |
-
}
|
| 522 |
-
}
|
| 523 |
-
|
| 524 |
-
static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps,
|
| 525 |
-
const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
|
| 526 |
-
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 527 |
-
item_ct1.get_local_id(1);
|
| 528 |
-
const int tid = item_ct1.get_local_id(2);
|
| 529 |
-
|
| 530 |
-
float tmp = 0.0f; // partial sum for thread in warp
|
| 531 |
-
|
| 532 |
-
for (int col = tid; col < ncols; col += block_size) {
|
| 533 |
-
const float xi = x[row*ncols + col];
|
| 534 |
-
tmp += xi * xi;
|
| 535 |
-
}
|
| 536 |
-
|
| 537 |
-
// sum up partial sums
|
| 538 |
-
tmp = warp_reduce_sum(tmp, item_ct1);
|
| 539 |
-
if (block_size > WARP_SIZE) {
|
| 540 |
-
|
| 541 |
-
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
| 542 |
-
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
| 543 |
-
if (lane_id == 0) {
|
| 544 |
-
s_sum[warp_id] = tmp;
|
| 545 |
-
}
|
| 546 |
-
/*
|
| 547 |
-
DPCT1118:3: SYCL group functions and algorithms must be encountered in
|
| 548 |
-
converged control flow. You may need to adjust the code.
|
| 549 |
-
*/
|
| 550 |
-
item_ct1.barrier(sycl::access::fence_space::local_space);
|
| 551 |
-
tmp = s_sum[lane_id];
|
| 552 |
-
tmp = warp_reduce_sum(tmp, item_ct1);
|
| 553 |
-
}
|
| 554 |
-
|
| 555 |
-
const float mean = tmp / ncols;
|
| 556 |
-
const float scale = sycl::rsqrt(mean + eps);
|
| 557 |
-
|
| 558 |
-
for (int col = tid; col < ncols; col += block_size) {
|
| 559 |
-
dst[row*ncols + col] = scale * x[row*ncols + col];
|
| 560 |
-
}
|
| 561 |
-
}
|
| 562 |
-
|
| 563 |
static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
|
| 564 |
const sycl::nd_item<3> &item_ct1) {
|
| 565 |
-
const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 566 |
-
|
| 567 |
|
| 568 |
if (ix >= kx_padded) {
|
| 569 |
return;
|
|
@@ -578,23 +377,39 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
|
|
| 578 |
|
| 579 |
const int ib = i_padded / QK8_1; // block index
|
| 580 |
const int iqs = i_padded % QK8_1; // quant index
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
#pragma unroll
|
| 587 |
-
for (int
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
}
|
|
|
|
|
|
|
| 593 |
|
| 594 |
const float d = amax / 127;
|
| 595 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
|
| 597 |
-
y[ib].qs[iqs] = q;
|
| 598 |
|
| 599 |
if (iqs > 0) {
|
| 600 |
return;
|
|
@@ -728,7 +543,7 @@ static void mul_mat_p021_f16_f32(
|
|
| 728 |
|
| 729 |
// sum up partial sums and write back result
|
| 730 |
#pragma unroll
|
| 731 |
-
for (int mask =
|
| 732 |
tmp +=
|
| 733 |
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
| 734 |
}
|
|
@@ -781,7 +596,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
|
| 781 |
|
| 782 |
// sum up partial sums and write back result
|
| 783 |
#pragma unroll
|
| 784 |
-
for (int mask =
|
| 785 |
tmp +=
|
| 786 |
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
| 787 |
}
|
|
@@ -1643,99 +1458,6 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
|
|
| 1643 |
});
|
| 1644 |
}
|
| 1645 |
|
| 1646 |
-
static void norm_f32_sycl(const float *x, float *dst, const int ncols,
|
| 1647 |
-
const int nrows, const float eps,
|
| 1648 |
-
queue_ptr stream) {
|
| 1649 |
-
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
| 1650 |
-
if (ncols < 1024) {
|
| 1651 |
-
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
| 1652 |
-
stream->submit([&](sycl::handler &cgh) {
|
| 1653 |
-
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
|
| 1654 |
-
sycl::range<1>(32), cgh);
|
| 1655 |
-
|
| 1656 |
-
cgh.parallel_for(
|
| 1657 |
-
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
| 1658 |
-
block_dims),
|
| 1659 |
-
[=](sycl::nd_item<3> item_ct1)
|
| 1660 |
-
[[intel::reqd_sub_group_size(32)]] {
|
| 1661 |
-
norm_f32(x, dst, ncols, eps, item_ct1,
|
| 1662 |
-
s_sum_acc_ct1.get_pointer(), WARP_SIZE);
|
| 1663 |
-
});
|
| 1664 |
-
});
|
| 1665 |
-
} else {
|
| 1666 |
-
const int work_group_size = get_work_group_size(stream->get_device());
|
| 1667 |
-
const sycl::range<3> block_dims(1, 1, work_group_size);
|
| 1668 |
-
/*
|
| 1669 |
-
DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
|
| 1670 |
-
the limit. To get the device limit, query
|
| 1671 |
-
info::device::max_work_group_size. Adjust the work-group size if needed.
|
| 1672 |
-
*/
|
| 1673 |
-
stream->submit([&](sycl::handler &cgh) {
|
| 1674 |
-
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
|
| 1675 |
-
sycl::range<1>(32), cgh);
|
| 1676 |
-
|
| 1677 |
-
cgh.parallel_for(
|
| 1678 |
-
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
| 1679 |
-
block_dims),
|
| 1680 |
-
[=](sycl::nd_item<3> item_ct1)
|
| 1681 |
-
[[intel::reqd_sub_group_size(32)]] {
|
| 1682 |
-
norm_f32(x, dst, ncols, eps, item_ct1,
|
| 1683 |
-
s_sum_acc_ct1.get_pointer(), work_group_size);
|
| 1684 |
-
});
|
| 1685 |
-
});
|
| 1686 |
-
}
|
| 1687 |
-
}
|
| 1688 |
-
|
| 1689 |
-
static void group_norm_f32_sycl(const float *x, float *dst,
|
| 1690 |
-
const int num_groups, const int group_size,
|
| 1691 |
-
const int ne_elements, queue_ptr stream) {
|
| 1692 |
-
static const float eps = 1e-6f;
|
| 1693 |
-
if (group_size < 1024) {
|
| 1694 |
-
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
| 1695 |
-
stream->submit([&](sycl::handler &cgh) {
|
| 1696 |
-
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
|
| 1697 |
-
cgh);
|
| 1698 |
-
|
| 1699 |
-
const float eps_ct4 = eps;
|
| 1700 |
-
|
| 1701 |
-
cgh.parallel_for(
|
| 1702 |
-
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
| 1703 |
-
block_dims),
|
| 1704 |
-
[=](sycl::nd_item<3> item_ct1)
|
| 1705 |
-
[[intel::reqd_sub_group_size(32)]] {
|
| 1706 |
-
group_norm_f32(
|
| 1707 |
-
x, dst, group_size, ne_elements, eps_ct4, item_ct1,
|
| 1708 |
-
s_sum_acc_ct1.get_pointer(), WARP_SIZE);
|
| 1709 |
-
});
|
| 1710 |
-
});
|
| 1711 |
-
} else {
|
| 1712 |
-
const int work_group_size = get_work_group_size(stream->get_device());
|
| 1713 |
-
const sycl::range<3> block_dims(1, 1, work_group_size);
|
| 1714 |
-
/*
|
| 1715 |
-
DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
|
| 1716 |
-
the limit. To get the device limit, query
|
| 1717 |
-
info::device::max_work_group_size. Adjust the work-group size if needed.
|
| 1718 |
-
*/
|
| 1719 |
-
|
| 1720 |
-
stream->submit([&](sycl::handler &cgh) {
|
| 1721 |
-
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
|
| 1722 |
-
cgh);
|
| 1723 |
-
|
| 1724 |
-
const float eps_ct4 = eps;
|
| 1725 |
-
|
| 1726 |
-
cgh.parallel_for(
|
| 1727 |
-
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
| 1728 |
-
block_dims),
|
| 1729 |
-
[=](sycl::nd_item<3> item_ct1)
|
| 1730 |
-
[[intel::reqd_sub_group_size(32)]] {
|
| 1731 |
-
group_norm_f32(x, dst, group_size, ne_elements,
|
| 1732 |
-
eps_ct4, item_ct1,
|
| 1733 |
-
s_sum_acc_ct1.get_pointer(), work_group_size);
|
| 1734 |
-
});
|
| 1735 |
-
});
|
| 1736 |
-
}
|
| 1737 |
-
}
|
| 1738 |
-
|
| 1739 |
static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
| 1740 |
const int ne0, int ne1, int ne2, int ne02,
|
| 1741 |
queue_ptr stream) {
|
|
@@ -1777,64 +1499,22 @@ static void pad_f32_sycl(const float *x, float *dst, const int ne00,
|
|
| 1777 |
});
|
| 1778 |
}
|
| 1779 |
|
| 1780 |
-
static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
|
| 1781 |
-
const int nrows, const float eps,
|
| 1782 |
-
queue_ptr stream) {
|
| 1783 |
-
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
| 1784 |
-
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
|
| 1785 |
-
if (ncols < 1024) {
|
| 1786 |
-
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
| 1787 |
-
stream->submit([&](sycl::handler &cgh) {
|
| 1788 |
-
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
|
| 1789 |
-
cgh);
|
| 1790 |
-
|
| 1791 |
-
cgh.parallel_for(
|
| 1792 |
-
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
| 1793 |
-
block_dims),
|
| 1794 |
-
[=](sycl::nd_item<3> item_ct1)
|
| 1795 |
-
[[intel::reqd_sub_group_size(32)]] {
|
| 1796 |
-
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
| 1797 |
-
s_sum_acc_ct1.get_pointer(), WARP_SIZE);
|
| 1798 |
-
});
|
| 1799 |
-
});
|
| 1800 |
-
} else {
|
| 1801 |
-
const int work_group_size = get_work_group_size(stream->get_device());
|
| 1802 |
-
const sycl::range<3> block_dims(1, 1, work_group_size);
|
| 1803 |
-
/*
|
| 1804 |
-
DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
|
| 1805 |
-
the limit. To get the device limit, query
|
| 1806 |
-
info::device::max_work_group_size. Adjust the work-group size if needed.
|
| 1807 |
-
*/
|
| 1808 |
-
stream->submit([&](sycl::handler &cgh) {
|
| 1809 |
-
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
|
| 1810 |
-
cgh);
|
| 1811 |
-
|
| 1812 |
-
cgh.parallel_for(
|
| 1813 |
-
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
| 1814 |
-
block_dims),
|
| 1815 |
-
[=](sycl::nd_item<3> item_ct1)
|
| 1816 |
-
[[intel::reqd_sub_group_size(32)]] {
|
| 1817 |
-
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
| 1818 |
-
s_sum_acc_ct1.get_pointer(), work_group_size);
|
| 1819 |
-
});
|
| 1820 |
-
});
|
| 1821 |
-
}
|
| 1822 |
-
}
|
| 1823 |
-
|
| 1824 |
static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
|
| 1825 |
const int ky, const int kx_padded,
|
| 1826 |
queue_ptr stream) {
|
| 1827 |
const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
|
| 1828 |
const sycl::range<3> num_blocks(1, ky, block_num_x);
|
| 1829 |
-
|
|
|
|
|
|
|
| 1830 |
{
|
| 1831 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 1832 |
{sycl::aspect::fp16});
|
| 1833 |
|
| 1834 |
stream->parallel_for(
|
| 1835 |
sycl::nd_range<3>(num_blocks * block_size, block_size),
|
| 1836 |
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
|
| 1837 |
-
quantize_q8_1(x, vy, kx, kx_padded, item_ct1);
|
| 1838 |
});
|
| 1839 |
}
|
| 1840 |
}
|
|
@@ -1854,7 +1534,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
|
|
| 1854 |
|
| 1855 |
stream->parallel_for(
|
| 1856 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1857 |
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
|
| 1858 |
mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
|
| 1859 |
nchannels_y, item_ct1);
|
| 1860 |
});
|
|
@@ -1874,7 +1554,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
|
|
| 1874 |
|
| 1875 |
stream->parallel_for(
|
| 1876 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1877 |
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
|
| 1878 |
mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
|
| 1879 |
row_stride_x, channel_stride_x,
|
| 1880 |
nchannels_y / nchannels_x, item_ct1);
|
|
@@ -2139,7 +1819,7 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
|
|
| 2139 |
const sycl::range<3> block_nums(1, nrows, 1);
|
| 2140 |
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 2141 |
[=](sycl::nd_item<3> item_ct1)
|
| 2142 |
-
[[intel::reqd_sub_group_size(
|
| 2143 |
k_sum_rows_f32(x, dst, ncols, item_ct1);
|
| 2144 |
});
|
| 2145 |
}
|
|
@@ -2220,7 +1900,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, float *
|
|
| 2220 |
|
| 2221 |
cgh.parallel_for(
|
| 2222 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 2223 |
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
|
| 2224 |
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
|
| 2225 |
nrows_y, scale, max_bias, m0,
|
| 2226 |
m1, n_head_log2, item_ct1,
|
|
@@ -2400,12 +2080,6 @@ static inline int get_sycl_env(const char *env_name, int default_val) {
|
|
| 2400 |
return user_number;
|
| 2401 |
}
|
| 2402 |
|
| 2403 |
-
static inline int get_work_group_size(const sycl::device& device) {
|
| 2404 |
-
dpct::device_info prop;
|
| 2405 |
-
dpct::get_device_info(prop, device);
|
| 2406 |
-
return prop.get_max_work_group_size();
|
| 2407 |
-
}
|
| 2408 |
-
|
| 2409 |
static void ggml_check_sycl() try {
|
| 2410 |
static bool initialized = false;
|
| 2411 |
|
|
@@ -2964,45 +2638,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
|
|
| 2964 |
(void) src1_dd;
|
| 2965 |
}
|
| 2966 |
|
| 2967 |
-
inline void ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
| 2968 |
-
ggml_tensor *dst, const float *src0_dd,
|
| 2969 |
-
const float *src1_dd, float *dst_dd,
|
| 2970 |
-
const queue_ptr &main_stream) {
|
| 2971 |
-
|
| 2972 |
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 2973 |
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
| 2974 |
-
|
| 2975 |
-
const int64_t ne00 = src0->ne[0];
|
| 2976 |
-
const int64_t nrows = ggml_nrows(src0);
|
| 2977 |
-
|
| 2978 |
-
float eps;
|
| 2979 |
-
memcpy(&eps, dst->op_params, sizeof(float));
|
| 2980 |
-
|
| 2981 |
-
norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
| 2982 |
-
|
| 2983 |
-
(void) src1;
|
| 2984 |
-
(void) dst;
|
| 2985 |
-
(void) src1_dd;
|
| 2986 |
-
}
|
| 2987 |
-
|
| 2988 |
-
inline void ggml_sycl_op_group_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
| 2989 |
-
const ggml_tensor *src1, ggml_tensor *dst,
|
| 2990 |
-
const float *src0_dd, const float *src1_dd,
|
| 2991 |
-
float *dst_dd,
|
| 2992 |
-
const queue_ptr &main_stream) {
|
| 2993 |
-
|
| 2994 |
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 2995 |
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
| 2996 |
-
|
| 2997 |
-
int num_groups = dst->op_params[0];
|
| 2998 |
-
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
|
| 2999 |
-
group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
|
| 3000 |
-
|
| 3001 |
-
(void) src1;
|
| 3002 |
-
(void) dst;
|
| 3003 |
-
(void) src1_dd;
|
| 3004 |
-
}
|
| 3005 |
-
|
| 3006 |
inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
| 3007 |
const ggml_tensor *src1, ggml_tensor *dst,
|
| 3008 |
const float *src0_dd, const float *src1_dd,
|
|
@@ -3066,28 +2701,6 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor
|
|
| 3066 |
(void) src1_dd;
|
| 3067 |
}
|
| 3068 |
|
| 3069 |
-
inline void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
| 3070 |
-
const ggml_tensor *src1, ggml_tensor *dst,
|
| 3071 |
-
const float *src0_dd, const float *src1_dd,
|
| 3072 |
-
float *dst_dd,
|
| 3073 |
-
const queue_ptr &main_stream) {
|
| 3074 |
-
|
| 3075 |
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
| 3076 |
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
| 3077 |
-
|
| 3078 |
-
const int64_t ne00 = src0->ne[0];
|
| 3079 |
-
const int64_t nrows = ggml_nrows(src0);
|
| 3080 |
-
|
| 3081 |
-
float eps;
|
| 3082 |
-
memcpy(&eps, dst->op_params, sizeof(float));
|
| 3083 |
-
|
| 3084 |
-
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
| 3085 |
-
|
| 3086 |
-
(void) src1;
|
| 3087 |
-
(void) dst;
|
| 3088 |
-
(void) src1_dd;
|
| 3089 |
-
}
|
| 3090 |
-
|
| 3091 |
static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
|
| 3092 |
int64_t min_compute_capability = INT_MAX;
|
| 3093 |
int64_t max_compute_capability = INT_MIN;
|
|
@@ -4273,7 +3886,6 @@ bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
|
| 4273 |
|
| 4274 |
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 4275 |
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
|
| 4276 |
-
|
| 4277 |
int64_t min_compute_capability = INT_MAX;
|
| 4278 |
|
| 4279 |
if (split) {
|
|
|
|
| 74 |
const float *src1_dd, float *dst_dd,
|
| 75 |
const queue_ptr &main_stream);
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
static __dpct_inline__ float op_repeat(const float a, const float b) {
|
| 78 |
return b;
|
| 79 |
GGML_UNUSED(a);
|
|
|
|
| 291 |
dst[i] = x[i] * x[i];
|
| 292 |
}
|
| 293 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02,
|
| 295 |
const sycl::nd_item<3> &item_ct1) {
|
| 296 |
int nidx = item_ct1.get_local_id(2) +
|
|
|
|
| 358 |
}
|
| 359 |
}
|
| 360 |
|
| 361 |
+
template<int QUANT_BLOCK_TILE>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
|
| 363 |
const sycl::nd_item<3> &item_ct1) {
|
| 364 |
+
const int ix = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
| 365 |
+
item_ct1.get_local_id(2)) * QUANT_BLOCK_TILE;
|
| 366 |
|
| 367 |
if (ix >= kx_padded) {
|
| 368 |
return;
|
|
|
|
| 377 |
|
| 378 |
const int ib = i_padded / QK8_1; // block index
|
| 379 |
const int iqs = i_padded % QK8_1; // quant index
|
| 380 |
+
typedef sycl::vec<float, QUANT_BLOCK_TILE> TC;
|
| 381 |
+
typedef sycl::vec<int8_t, QUANT_BLOCK_TILE> TQ;
|
| 382 |
+
TC zeros;
|
| 383 |
+
TQ qzeros;
|
|
|
|
| 384 |
#pragma unroll
|
| 385 |
+
for (int i = 0; i < QUANT_BLOCK_TILE; i++)
|
| 386 |
+
{
|
| 387 |
+
zeros[i] = 0.f;
|
| 388 |
+
qzeros[i] = 0;
|
| 389 |
+
}
|
| 390 |
+
const TC xi = ix < kx ? *(TC *)&x[iy * kx + ix] : zeros;
|
| 391 |
+
float sum = xi[0];
|
| 392 |
+
float amax = sycl::fabs(xi[0]);
|
| 393 |
+
#pragma unroll
|
| 394 |
+
for (int i = 1; i < QUANT_BLOCK_TILE; i++)
|
| 395 |
+
{
|
| 396 |
+
sum += xi[i];
|
| 397 |
+
amax = sycl::fmax(sycl::fabs(xi[i]), amax);
|
| 398 |
}
|
| 399 |
+
sum = warp_reduce_sum(sum, item_ct1);
|
| 400 |
+
amax = warp_reduce_max(amax, item_ct1);
|
| 401 |
|
| 402 |
const float d = amax / 127;
|
| 403 |
+
TQ q = qzeros;
|
| 404 |
+
if (amax != 0.0f)
|
| 405 |
+
{
|
| 406 |
+
#pragma unroll
|
| 407 |
+
for (int i = 0; i < QUANT_BLOCK_TILE; i++) {
|
| 408 |
+
q[i] = sycl::round(xi[i] / d);
|
| 409 |
+
}
|
| 410 |
+
}
|
| 411 |
|
| 412 |
+
*(TQ *)&y[ib].qs[iqs] = q;
|
| 413 |
|
| 414 |
if (iqs > 0) {
|
| 415 |
return;
|
|
|
|
| 543 |
|
| 544 |
// sum up partial sums and write back result
|
| 545 |
#pragma unroll
|
| 546 |
+
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
| 547 |
tmp +=
|
| 548 |
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
| 549 |
}
|
|
|
|
| 596 |
|
| 597 |
// sum up partial sums and write back result
|
| 598 |
#pragma unroll
|
| 599 |
+
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
| 600 |
tmp +=
|
| 601 |
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
| 602 |
}
|
|
|
|
| 1458 |
});
|
| 1459 |
}
|
| 1460 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1461 |
static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
| 1462 |
const int ne0, int ne1, int ne2, int ne02,
|
| 1463 |
queue_ptr stream) {
|
|
|
|
| 1499 |
});
|
| 1500 |
}
|
| 1501 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1502 |
static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
|
| 1503 |
const int ky, const int kx_padded,
|
| 1504 |
queue_ptr stream) {
|
| 1505 |
const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
|
| 1506 |
const sycl::range<3> num_blocks(1, ky, block_num_x);
|
| 1507 |
+
int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
|
| 1508 |
+
static_assert(QK8_1 % WARP_SIZE == 0);
|
| 1509 |
+
const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
|
| 1510 |
{
|
| 1511 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 1512 |
{sycl::aspect::fp16});
|
| 1513 |
|
| 1514 |
stream->parallel_for(
|
| 1515 |
sycl::nd_range<3>(num_blocks * block_size, block_size),
|
| 1516 |
+
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
| 1517 |
+
quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
|
| 1518 |
});
|
| 1519 |
}
|
| 1520 |
}
|
|
|
|
| 1534 |
|
| 1535 |
stream->parallel_for(
|
| 1536 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1537 |
+
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
| 1538 |
mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
|
| 1539 |
nchannels_y, item_ct1);
|
| 1540 |
});
|
|
|
|
| 1554 |
|
| 1555 |
stream->parallel_for(
|
| 1556 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1557 |
+
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
| 1558 |
mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
|
| 1559 |
row_stride_x, channel_stride_x,
|
| 1560 |
nchannels_y / nchannels_x, item_ct1);
|
|
|
|
| 1819 |
const sycl::range<3> block_nums(1, nrows, 1);
|
| 1820 |
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1821 |
[=](sycl::nd_item<3> item_ct1)
|
| 1822 |
+
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
| 1823 |
k_sum_rows_f32(x, dst, ncols, item_ct1);
|
| 1824 |
});
|
| 1825 |
}
|
|
|
|
| 1900 |
|
| 1901 |
cgh.parallel_for(
|
| 1902 |
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 1903 |
+
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
| 1904 |
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
|
| 1905 |
nrows_y, scale, max_bias, m0,
|
| 1906 |
m1, n_head_log2, item_ct1,
|
|
|
|
| 2080 |
return user_number;
|
| 2081 |
}
|
| 2082 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2083 |
static void ggml_check_sycl() try {
|
| 2084 |
static bool initialized = false;
|
| 2085 |
|
|
|
|
| 2638 |
(void) src1_dd;
|
| 2639 |
}
|
| 2640 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2641 |
inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
| 2642 |
const ggml_tensor *src1, ggml_tensor *dst,
|
| 2643 |
const float *src0_dd, const float *src1_dd,
|
|
|
|
| 2701 |
(void) src1_dd;
|
| 2702 |
}
|
| 2703 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2704 |
static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
|
| 2705 |
int64_t min_compute_capability = INT_MAX;
|
| 2706 |
int64_t max_compute_capability = INT_MIN;
|
|
|
|
| 3886 |
|
| 3887 |
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 3888 |
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
|
|
|
|
| 3889 |
int64_t min_compute_capability = INT_MAX;
|
| 3890 |
|
| 3891 |
if (split) {
|