KevinLy committed
Commit 2dd429e · 1 Parent(s): afa1447

Fix the sub group size of Intel (llama/8106)

* use warp_size macro for all sycl kernels

* fix mask of permute_sub_group_by_xor

* fix rms_norm with correct warp number

* fix rms_norm_f32/group_norm_f32

* move norm to norm.cpp file

* fix quantize bug

* fix mmvq's batch size
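
The bullets above all come back to one point: the SYCL kernels previously hard-coded a 32-wide sub-group (warp) in their XOR-shuffle reductions and in [[intel::reqd_sub_group_size(...)]], which breaks on Intel GPUs, where this commit standardizes on 16-wide sub-groups. The following is a minimal, self-contained sketch (not the repository code) of the pattern the commit adopts: a single GGML_SYCL_WARP_SIZE definition, set per target by CMake, drives both the reduction loop bound and the required sub-group size. The standard sycl::permute_group_by_xor stands in here for the dpct::permute_sub_group_by_xor helper used in the tree, and the default value of 16 is an assumption matching the Intel branch of the CMake change below.

    // Minimal sketch: warp-size-aware sub-group reduction (not the repository code).
    #include <sycl/sycl.hpp>
    #include <cstdio>

    #ifndef GGML_SYCL_WARP_SIZE
    #define GGML_SYCL_WARP_SIZE 16   // assumed Intel default, per the CMake change below
    #endif
    #define WARP_SIZE GGML_SYCL_WARP_SIZE

    // Butterfly reduction across one sub-group; starting the mask at
    // WARP_SIZE / 2 keeps the shuffle distance inside the sub-group,
    // instead of the old hard-coded 16 which assumed a 32-wide warp.
    static inline float warp_reduce_sum(float x, const sycl::nd_item<1> &it) {
    #pragma unroll
        for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
            x += sycl::permute_group_by_xor(it.get_sub_group(), x, mask);
        }
        return x;
    }

    int main() {
        sycl::queue q;
        float *out = sycl::malloc_shared<float>(1, q);
        q.parallel_for(
             sycl::nd_range<1>(sycl::range<1>(WARP_SIZE), sycl::range<1>(WARP_SIZE)),
             [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                 float v = warp_reduce_sum(1.0f, it);   // every lane contributes 1
                 if (it.get_local_id(0) == 0) out[0] = v;
             })
            .wait();
        std::printf("sum over one warp = %.0f (expected %d)\n", out[0], WARP_SIZE);
        sycl::free(out, q);
        return 0;
    }

The quantize fix in this commit relies on the same constant: with QK8_1 = 32, QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE makes each work-item handle two values on Intel (16-wide warp) and one on NVIDIA (32-wide warp), so a single sub-group still covers one whole q8_1 block.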

Files changed (2)
  1. ggml/src/CMakeLists.txt +3 -1
  2. ggml/src/ggml-sycl.cpp +42 -430
ggml/src/CMakeLists.txt CHANGED
@@ -486,9 +486,11 @@ if (GGML_SYCL)
     add_compile_options(-I./) #include DPCT
 
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
     if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
+        add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+    else()
+        add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
     endif()
 
     file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
ggml/src/ggml-sycl.cpp CHANGED
@@ -74,51 +74,6 @@ typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const gg
                                        const float *src1_dd, float *dst_dd,
                                        const queue_ptr &main_stream);
 
-static __dpct_inline__ float warp_reduce_sum(float x,
-                                             const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1096:98: The right-most dimension of the work-group used in the SYCL
-        kernel that calls this function may be less than "32". The function
-        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
-        CPU device. Modify the size of the work-group to ensure that the value
-        of the right-most dimension is a multiple of "32".
-        */
-        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
-    }
-    return x;
-}
-
-static __dpct_inline__ sycl::float2
-warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
-                                                mask);
-        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
-                                                mask);
-    }
-    return a;
-}
-
-static __dpct_inline__ float warp_reduce_max(float x,
-                                             const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1096:97: The right-most dimension of the work-group used in the SYCL
-        kernel that calls this function may be less than "32". The function
-        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
-        CPU device. Modify the size of the work-group to ensure that the value
-        of the right-most dimension is a multiple of "32".
-        */
-        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
-                              item_ct1.get_sub_group(), x, mask));
-    }
-    return x;
-}
-
 static __dpct_inline__ float op_repeat(const float a, const float b) {
     return b;
     GGML_UNUSED(a);
@@ -336,47 +291,6 @@ static void sqr_f32(const float * x, float * dst, const int k,
     dst[i] = x[i] * x[i];
 }
 
-static void norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                     const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum, int block_size) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    const int tid = item_ct1.get_local_id(2);
-
-    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        mean_var.x() += xi;
-        mean_var.y() += xi * xi;
-    }
-
-    // sum up partial sums
-    mean_var = warp_reduce_sum(mean_var, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = mean_var;
-        }
-        /*
-        DPCT1118:0: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        mean_var = s_sum[lane_id];
-        mean_var = warp_reduce_sum(mean_var, item_ct1);
-    }
-
-    const float mean = mean_var.x() / ncols;
-    const float var = mean_var.y() / ncols - mean * mean;
-    const float inv_std = sycl::rsqrt(var + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-    }
-}
-
 static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02,
                        const sycl::nd_item<3> &item_ct1) {
     int nidx = item_ct1.get_local_id(2) +
@@ -444,126 +358,11 @@ static void pad_f32(const float *x, float *dst, const int ne0, const int ne00,
     }
 }
 
-static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps,
-                           const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
-    int start = item_ct1.get_group(2) * group_size;
-    int end = start + group_size;
-
-    start += item_ct1.get_local_id(2);
-
-    if (end >= ne_elements) {
-        end = ne_elements;
-    }
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int j = start; j < end; j += block_size) {
-        tmp += x[j];
-    }
-
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:1: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    float mean = tmp / group_size;
-    tmp = 0.0f;
-
-    for (int j = start; j < end; j += block_size) {
-        float xi = x[j] - mean;
-        dst[j] = xi;
-        tmp += xi * xi;
-    }
-
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:2: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    float variance = tmp / group_size;
-    float scale = sycl::rsqrt(variance + eps);
-    for (int j = start; j < end; j += block_size) {
-        dst[j] *= scale;
-    }
-}
-
-static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                         const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    const int tid = item_ct1.get_local_id(2);
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        tmp += xi * xi;
-    }
-
-    // sum up partial sums
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:3: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    const float mean = tmp / ncols;
-    const float scale = sycl::rsqrt(mean + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = scale * x[row*ncols + col];
-    }
-}
-
+template<int QUANT_BLOCK_TILE>
 static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
                           const sycl::nd_item<3> &item_ct1) {
-    const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                   item_ct1.get_local_id(2);
+    const int ix = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2)) * QUANT_BLOCK_TILE;
 
     if (ix >= kx_padded) {
         return;
@@ -578,23 +377,39 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
 
     const int ib = i_padded / QK8_1; // block index
     const int iqs = i_padded % QK8_1; // quant index
-
-    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
-    float amax = sycl::fabs((float)xi);
-    float sum = xi;
-
+    typedef sycl::vec<float, QUANT_BLOCK_TILE> TC;
+    typedef sycl::vec<int8_t, QUANT_BLOCK_TILE> TQ;
+    TC zeros;
+    TQ qzeros;
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor(
-                                    item_ct1.get_sub_group(), amax, mask));
-        sum +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask);
+    for (int i = 0; i < QUANT_BLOCK_TILE; i++)
+    {
+        zeros[i] = 0.f;
+        qzeros[i] = 0;
+    }
+    const TC xi = ix < kx ? *(TC *)&x[iy * kx + ix] : zeros;
+    float sum = xi[0];
+    float amax = sycl::fabs(xi[0]);
+#pragma unroll
+    for (int i = 1; i < QUANT_BLOCK_TILE; i++)
+    {
+        sum += xi[i];
+        amax = sycl::fmax(sycl::fabs(xi[i]), amax);
     }
+    sum = warp_reduce_sum(sum, item_ct1);
+    amax = warp_reduce_max(amax, item_ct1);
 
     const float d = amax / 127;
-    const int8_t q = amax == 0.0f ? 0 : sycl::round(xi / d);
+    TQ q = qzeros;
+    if (amax != 0.0f)
+    {
+#pragma unroll
+        for (int i = 0; i < QUANT_BLOCK_TILE; i++) {
+            q[i] = sycl::round(xi[i] / d);
+        }
+    }
 
-    y[ib].qs[iqs] = q;
+    *(TQ *)&y[ib].qs[iqs] = q;
 
     if (iqs > 0) {
         return;
@@ -728,7 +543,7 @@ static void mul_mat_p021_f16_f32(
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -781,7 +596,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -1643,99 +1458,6 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-static void norm_f32_sycl(const float *x, float *dst, const int ncols,
-                          const int nrows, const float eps,
-                          queue_ptr stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
-                sycl::range<1>(32), cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        norm_f32(x, dst, ncols, eps, item_ct1,
-                                 s_sum_acc_ct1.get_pointer(), WARP_SIZE);
-                    });
-        });
-    } else {
-        const int work_group_size = get_work_group_size(stream->get_device());
-        const sycl::range<3> block_dims(1, 1, work_group_size);
-        /*
-        DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
-                sycl::range<1>(32), cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        norm_f32(x, dst, ncols, eps, item_ct1,
-                                 s_sum_acc_ct1.get_pointer(), work_group_size);
-                    });
-        });
-    }
-}
-
-static void group_norm_f32_sycl(const float *x, float *dst,
-                                const int num_groups, const int group_size,
-                                const int ne_elements, queue_ptr stream) {
-    static const float eps = 1e-6f;
-    if (group_size < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            const float eps_ct4 = eps;
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        group_norm_f32(
-                            x, dst, group_size, ne_elements, eps_ct4, item_ct1,
-                            s_sum_acc_ct1.get_pointer(), WARP_SIZE);
-                    });
-        });
-    } else {
-        const int work_group_size = get_work_group_size(stream->get_device());
-        const sycl::range<3> block_dims(1, 1, work_group_size);
-        /*
-        DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            const float eps_ct4 = eps;
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        group_norm_f32(x, dst, group_size, ne_elements,
-                                       eps_ct4, item_ct1,
-                                       s_sum_acc_ct1.get_pointer(), work_group_size);
-                    });
-        });
-    }
-}
-
 static void concat_f32_sycl(const float *x, const float *y, float *dst,
                             const int ne0, int ne1, int ne2, int ne02,
                             queue_ptr stream) {
@@ -1777,64 +1499,22 @@ static void pad_f32_sycl(const float *x, float *dst, const int ne00,
         });
 }
 
-static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
-                              const int nrows, const float eps,
-                              queue_ptr stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
-    if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        rms_norm_f32(x, dst, ncols, eps, item_ct1,
-                                     s_sum_acc_ct1.get_pointer(), WARP_SIZE);
-                    });
-        });
-    } else {
-        const int work_group_size = get_work_group_size(stream->get_device());
-        const sycl::range<3> block_dims(1, 1, work_group_size);
-        /*
-        DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        rms_norm_f32(x, dst, ncols, eps, item_ct1,
-                                     s_sum_acc_ct1.get_pointer(), work_group_size);
-                    });
-        });
-    }
-}
-
 static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                    const int ky, const int kx_padded,
                                    queue_ptr stream) {
     const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
     const sycl::range<3> num_blocks(1, ky, block_num_x);
-    const sycl::range<3> block_size(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE);
+    int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
+    static_assert(QK8_1 % WARP_SIZE == 0);
+    const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
        stream->parallel_for(
            sycl::nd_range<3>(num_blocks * block_size, block_size),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                quantize_q8_1(x, vy, kx, kx_padded, item_ct1);
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
            });
     }
 }
@@ -1854,7 +1534,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
 
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
            mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
                                 nchannels_y, item_ct1);
        });
@@ -1874,7 +1554,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
 
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
            mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
                                   row_stride_x, channel_stride_x,
                                   nchannels_y / nchannels_x, item_ct1);
@@ -2139,7 +1819,7 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
     const sycl::range<3> block_nums(1, nrows, 1);
     stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                          [=](sycl::nd_item<3> item_ct1)
-                             [[intel::reqd_sub_group_size(32)]] {
+                             [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                                  k_sum_rows_f32(x, dst, ncols, item_ct1);
                              });
 }
@@ -2220,7 +1900,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, float *
 
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
                                                                             nrows_y, scale, max_bias, m0,
                                                                             m1, n_head_log2, item_ct1,
@@ -2400,12 +2080,6 @@ static inline int get_sycl_env(const char *env_name, int default_val) {
     return user_number;
 }
 
-static inline int get_work_group_size(const sycl::device& device) {
-    dpct::device_info prop;
-    dpct::get_device_info(prop, device);
-    return prop.get_max_work_group_size();
-}
-
 static void ggml_check_sycl() try {
     static bool initialized = false;
 
@@ -2964,45 +2638,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
     (void) src1_dd;
 }
 
-inline void ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_group_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int num_groups = dst->op_params[0];
-    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
 inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_dd, const float *src1_dd,
@@ -3066,28 +2701,6 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor
     (void) src1_dd;
 }
 
-inline void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
 static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
     int64_t min_compute_capability = INT_MAX;
     int64_t max_compute_capability = INT_MIN;
@@ -4273,7 +3886,6 @@ bool ggml_sycl_supports_dmmv(enum ggml_type type) {
 
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
-
     int64_t min_compute_capability = INT_MAX;
 
     if (split) {