hengyu commited on
Commit
7a97623
·
unverified ·
1 Parent(s): 1ff7b08

Disable iqx on windows as WA (llama/6435)

Browse files

* disable iqx on windows as WA

* array instead of global_memory

Files changed (2) hide show
  1. ggml-common.h +3 -2
  2. ggml-sycl.cpp +38 -114
ggml-common.h CHANGED
@@ -447,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
447
 
448
  #define GGML_COMMON_IMPL
449
  #elif defined(GGML_COMMON_IMPL_SYCL)
 
450
  #include <cstdint>
451
 
452
- #define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory<const type, 1> name(sycl::range<1>(size), {
453
- #define GGML_TABLE_END() });
454
 
455
  #define GGML_COMMON_IMPL
456
  #endif
 
447
 
448
  #define GGML_COMMON_IMPL
449
  #elif defined(GGML_COMMON_IMPL_SYCL)
450
+
451
  #include <cstdint>
452
 
453
+ #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
454
+ #define GGML_TABLE_END() };
455
 
456
  #define GGML_COMMON_IMPL
457
  #endif
ggml-sycl.cpp CHANGED
@@ -8079,7 +8079,7 @@ template <bool need_check> static void
8079
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
8080
  static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8081
  const sycl::nd_item<3> &item_ct1,
8082
- const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) {
8083
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8084
  item_ct1.get_local_id(1);
8085
 
@@ -9956,17 +9956,14 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
9956
  dpct::queue_ptr stream) {
9957
  const int nb = k / QK_K;
9958
  {
9959
- iq2xxs_grid.init(*stream);
9960
- ksigns_iq2xs.init(*stream);
9961
- kmask_iq2xs.init(*stream);
9962
 
9963
  dpct::has_capability_or_fail(stream->get_device(),
9964
  {sycl::aspect::fp16});
9965
 
9966
  stream->submit([&](sycl::handler &cgh) {
9967
- auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
9968
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
9969
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
9970
 
9971
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
9972
  sycl::range<3>(1, 1, 32),
@@ -9985,17 +9982,14 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
9985
  dpct::queue_ptr stream) {
9986
  const int nb = k / QK_K;
9987
  {
9988
- iq2xs_grid.init(*stream);
9989
- ksigns_iq2xs.init(*stream);
9990
- kmask_iq2xs.init(*stream);
9991
 
9992
  dpct::has_capability_or_fail(stream->get_device(),
9993
  {sycl::aspect::fp16});
9994
 
9995
  stream->submit([&](sycl::handler &cgh) {
9996
- auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
9997
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
9998
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
9999
 
10000
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10001
  sycl::range<3>(1, 1, 32),
@@ -10014,17 +10008,14 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
10014
  dpct::queue_ptr stream) {
10015
  const int nb = k / QK_K;
10016
  {
10017
- iq3xxs_grid.init(*stream);
10018
- ksigns_iq2xs.init(*stream);
10019
- kmask_iq2xs.init(*stream);
10020
 
10021
  dpct::has_capability_or_fail(stream->get_device(),
10022
  {sycl::aspect::fp16});
10023
 
10024
  stream->submit([&](sycl::handler &cgh) {
10025
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10026
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10027
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
10028
 
10029
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10030
  sycl::range<3>(1, 1, 32),
@@ -10043,17 +10034,14 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
10043
  dpct::queue_ptr stream) {
10044
  const int nb = k / QK_K;
10045
  {
10046
- iq3s_grid.init(*stream);
10047
- ksigns_iq2xs.init(*stream);
10048
- kmask_iq2xs.init(*stream);
10049
 
10050
  dpct::has_capability_or_fail(stream->get_device(),
10051
  {sycl::aspect::fp16});
10052
 
10053
  stream->submit([&](sycl::handler &cgh) {
10054
- auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
10055
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10056
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
10057
 
10058
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10059
  sycl::range<3>(1, 1, 32),
@@ -10072,17 +10060,14 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
10072
  dpct::queue_ptr stream) {
10073
  const int nb = k / QK_K;
10074
  {
10075
- iq1s_grid_gpu.init(*stream);
10076
- ksigns_iq2xs.init(*stream);
10077
- kmask_iq2xs.init(*stream);
10078
 
10079
  dpct::has_capability_or_fail(stream->get_device(),
10080
  {sycl::aspect::fp16});
10081
 
10082
  stream->submit([&](sycl::handler &cgh) {
10083
- auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
10084
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10085
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
10086
 
10087
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10088
  sycl::range<3>(1, 1, 32),
@@ -10415,12 +10400,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
10415
  const sycl::range<3> block_nums(1, 1, block_num_y);
10416
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10417
  {
10418
- iq3xxs_grid.init(*stream);
10419
- ksigns64.init(*stream);
10420
 
10421
  stream->submit([&](sycl::handler &cgh) {
10422
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10423
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10424
 
10425
  cgh.parallel_for(
10426
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10428,8 +10409,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
10428
  [[intel::reqd_sub_group_size(32)]] {
10429
  mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
10430
  VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
10431
- vx, vy, dst, ncols, nrows, item_ct1,
10432
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10433
  });
10434
  });
10435
  }
@@ -10444,12 +10424,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
10444
  const sycl::range<3> block_nums(1, 1, block_num_y);
10445
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10446
  {
10447
- iq3xxs_grid.init(*stream);
10448
- ksigns64.init(*stream);
10449
 
10450
  stream->submit([&](sycl::handler &cgh) {
10451
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10452
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10453
 
10454
  cgh.parallel_for(
10455
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10457,8 +10433,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
10457
  [[intel::reqd_sub_group_size(32)]] {
10458
  mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
10459
  VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
10460
- vx, vy, dst, ncols, nrows, item_ct1,
10461
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10462
  });
10463
  });
10464
  }
@@ -10473,12 +10448,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
10473
  const sycl::range<3> block_nums(1, 1, block_num_y);
10474
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10475
  {
10476
- iq3xxs_grid.init(*stream);
10477
- ksigns64.init(*stream);
10478
 
10479
  stream->submit([&](sycl::handler &cgh) {
10480
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10481
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10482
 
10483
  cgh.parallel_for(
10484
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10486,8 +10457,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
10486
  [[intel::reqd_sub_group_size(32)]] {
10487
  mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
10488
  VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
10489
- vx, vy, dst, ncols, nrows, item_ct1,
10490
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10491
  });
10492
  });
10493
  }
@@ -10502,12 +10472,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
10502
  const sycl::range<3> block_nums(1, 1, block_num_y);
10503
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10504
  {
10505
- iq3xxs_grid.init(*stream);
10506
- ksigns64.init(*stream);
10507
 
10508
  stream->submit([&](sycl::handler &cgh) {
10509
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10510
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10511
 
10512
  cgh.parallel_for(
10513
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10515,8 +10481,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
10515
  [[intel::reqd_sub_group_size(32)]] {
10516
  mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
10517
  VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
10518
- vx, vy, dst, ncols, nrows, item_ct1,
10519
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10520
  });
10521
  });
10522
  }
@@ -10531,12 +10496,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
10531
  const sycl::range<3> block_nums(1, 1, block_num_y);
10532
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10533
  {
10534
- iq3xxs_grid.init(*stream);
10535
- ksigns64.init(*stream);
10536
 
10537
  stream->submit([&](sycl::handler &cgh) {
10538
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10539
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10540
 
10541
  cgh.parallel_for(
10542
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10544,8 +10505,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
10544
  [[intel::reqd_sub_group_size(32)]] {
10545
  mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
10546
  VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
10547
- vx, vy, dst, ncols, nrows, item_ct1,
10548
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10549
  });
10550
  });
10551
  }
@@ -10560,12 +10520,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
10560
  const sycl::range<3> block_nums(1, 1, block_num_y);
10561
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10562
  {
10563
- iq3xxs_grid.init(*stream);
10564
- ksigns64.init(*stream);
10565
 
10566
  stream->submit([&](sycl::handler &cgh) {
10567
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10568
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10569
 
10570
  cgh.parallel_for(
10571
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10573,8 +10529,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
10573
  [[intel::reqd_sub_group_size(32)]] {
10574
  mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
10575
  VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
10576
- vx, vy, dst, ncols, nrows, item_ct1,
10577
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10578
  });
10579
  });
10580
  }
@@ -10589,12 +10544,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
10589
  const sycl::range<3> block_nums(1, 1, block_num_y);
10590
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10591
  {
10592
- iq3xxs_grid.init(*stream);
10593
- ksigns64.init(*stream);
10594
 
10595
  stream->submit([&](sycl::handler &cgh) {
10596
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10597
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10598
 
10599
  cgh.parallel_for(
10600
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10602,8 +10553,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
10602
  [[intel::reqd_sub_group_size(32)]] {
10603
  mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
10604
  VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
10605
- vx, vy, dst, ncols, nrows, item_ct1,
10606
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10607
  });
10608
  });
10609
  }
@@ -10618,12 +10568,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
10618
  const sycl::range<3> block_nums(1, 1, block_num_y);
10619
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10620
  {
10621
- iq3xxs_grid.init(*stream);
10622
- ksigns64.init(*stream);
10623
 
10624
  stream->submit([&](sycl::handler &cgh) {
10625
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10626
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10627
 
10628
  cgh.parallel_for(
10629
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10631,8 +10577,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
10631
  [[intel::reqd_sub_group_size(32)]] {
10632
  mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
10633
  VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
10634
- vx, vy, dst, ncols, nrows, item_ct1,
10635
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10636
  });
10637
  });
10638
  }
@@ -10647,12 +10592,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
10647
  const sycl::range<3> block_nums(1, 1, block_num_y);
10648
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10649
  {
10650
- iq3xxs_grid.init(*stream);
10651
- ksigns64.init(*stream);
10652
 
10653
  stream->submit([&](sycl::handler &cgh) {
10654
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10655
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10656
 
10657
  cgh.parallel_for(
10658
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10660,8 +10601,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
10660
  [[intel::reqd_sub_group_size(32)]] {
10661
  mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
10662
  VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
10663
- vx, vy, dst, ncols, nrows, item_ct1,
10664
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10665
  });
10666
  });
10667
  }
@@ -10676,12 +10616,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
10676
  const sycl::range<3> block_nums(1, 1, block_num_y);
10677
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10678
  {
10679
- iq3xxs_grid.init(*stream);
10680
- ksigns64.init(*stream);
10681
 
10682
  stream->submit([&](sycl::handler &cgh) {
10683
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10684
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10685
 
10686
  cgh.parallel_for(
10687
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10689,13 +10625,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
10689
  [[intel::reqd_sub_group_size(32)]] {
10690
  mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
10691
  VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
10692
- vx, vy, dst, ncols, nrows, item_ct1,
10693
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10694
  });
10695
  });
10696
  }
10697
  }
10698
 
 
10699
  static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
10700
  float *dst, const int ncols,
10701
  const int nrows,
@@ -10705,15 +10641,11 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
10705
  const sycl::range<3> block_nums(1, 1, block_num_y);
10706
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10707
  {
10708
- iq2xxs_grid.init(*stream);
10709
- ksigns_iq2xs.init(*stream);
10710
- kmask_iq2xs.init(*stream);
10711
-
10712
 
10713
  stream->submit([&](sycl::handler &cgh) {
10714
- auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
10715
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10716
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
10717
 
10718
  cgh.parallel_for(
10719
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10736,12 +10668,10 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
10736
  const sycl::range<3> block_nums(1, 1, block_num_y);
10737
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10738
  {
10739
- iq2xs_grid.init(*stream);
10740
- ksigns64.init(*stream);
10741
 
10742
  stream->submit([&](sycl::handler &cgh) {
10743
- auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
10744
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10745
 
10746
  cgh.parallel_for(
10747
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10764,12 +10694,10 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
10764
  const sycl::range<3> block_nums(1, 1, block_num_y);
10765
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10766
  {
10767
- iq3xxs_grid.init(*stream);
10768
- ksigns64.init(*stream);
10769
 
10770
  stream->submit([&](sycl::handler &cgh) {
10771
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10772
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10773
 
10774
  cgh.parallel_for(
10775
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10792,12 +10720,10 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
10792
  const sycl::range<3> block_nums(1, 1, block_num_y);
10793
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10794
  {
10795
- iq3s_grid.init(*stream);
10796
- ksigns64.init(*stream);
10797
 
10798
  stream->submit([&](sycl::handler &cgh) {
10799
- auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
10800
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10801
 
10802
  cgh.parallel_for(
10803
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10820,12 +10746,10 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
10820
  const sycl::range<3> block_nums(1, 1, block_num_y);
10821
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10822
  {
10823
- iq1s_grid_gpu.init(*stream);
10824
- ksigns64.init(*stream);
10825
 
10826
  stream->submit([&](sycl::handler &cgh) {
10827
- auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
10828
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10829
 
10830
  cgh.parallel_for(
10831
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
8079
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
8080
  static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8081
  const sycl::nd_item<3> &item_ct1,
8082
+ const uint32_t *iq3xxs_grid_ptr=nullptr, const uint64_t *ksigns64_ptr=nullptr) {
8083
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8084
  item_ct1.get_local_id(1);
8085
 
 
9956
  dpct::queue_ptr stream) {
9957
  const int nb = k / QK_K;
9958
  {
 
 
 
9959
 
9960
  dpct::has_capability_or_fail(stream->get_device(),
9961
  {sycl::aspect::fp16});
9962
 
9963
  stream->submit([&](sycl::handler &cgh) {
9964
+ auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
9965
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
9966
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
9967
 
9968
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
9969
  sycl::range<3>(1, 1, 32),
 
9982
  dpct::queue_ptr stream) {
9983
  const int nb = k / QK_K;
9984
  {
 
 
 
9985
 
9986
  dpct::has_capability_or_fail(stream->get_device(),
9987
  {sycl::aspect::fp16});
9988
 
9989
  stream->submit([&](sycl::handler &cgh) {
9990
+ auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
9991
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
9992
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
9993
 
9994
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
9995
  sycl::range<3>(1, 1, 32),
 
10008
  dpct::queue_ptr stream) {
10009
  const int nb = k / QK_K;
10010
  {
 
 
 
10011
 
10012
  dpct::has_capability_or_fail(stream->get_device(),
10013
  {sycl::aspect::fp16});
10014
 
10015
  stream->submit([&](sycl::handler &cgh) {
10016
+ auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
10017
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
10018
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10019
 
10020
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10021
  sycl::range<3>(1, 1, 32),
 
10034
  dpct::queue_ptr stream) {
10035
  const int nb = k / QK_K;
10036
  {
 
 
 
10037
 
10038
  dpct::has_capability_or_fail(stream->get_device(),
10039
  {sycl::aspect::fp16});
10040
 
10041
  stream->submit([&](sycl::handler &cgh) {
10042
+ auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
10043
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
10044
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10045
 
10046
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10047
  sycl::range<3>(1, 1, 32),
 
10060
  dpct::queue_ptr stream) {
10061
  const int nb = k / QK_K;
10062
  {
 
 
 
10063
 
10064
  dpct::has_capability_or_fail(stream->get_device(),
10065
  {sycl::aspect::fp16});
10066
 
10067
  stream->submit([&](sycl::handler &cgh) {
10068
+ auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
10069
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
10070
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10071
 
10072
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10073
  sycl::range<3>(1, 1, 32),
 
10400
  const sycl::range<3> block_nums(1, 1, block_num_y);
10401
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10402
  {
 
 
10403
 
10404
  stream->submit([&](sycl::handler &cgh) {
 
 
10405
 
10406
  cgh.parallel_for(
10407
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10409
  [[intel::reqd_sub_group_size(32)]] {
10410
  mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
10411
  VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
10412
+ vx, vy, dst, ncols, nrows, item_ct1);
 
10413
  });
10414
  });
10415
  }
 
10424
  const sycl::range<3> block_nums(1, 1, block_num_y);
10425
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10426
  {
 
 
10427
 
10428
  stream->submit([&](sycl::handler &cgh) {
 
 
10429
 
10430
  cgh.parallel_for(
10431
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10433
  [[intel::reqd_sub_group_size(32)]] {
10434
  mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
10435
  VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
10436
+ vx, vy, dst, ncols, nrows, item_ct1);
 
10437
  });
10438
  });
10439
  }
 
10448
  const sycl::range<3> block_nums(1, 1, block_num_y);
10449
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10450
  {
 
 
10451
 
10452
  stream->submit([&](sycl::handler &cgh) {
 
 
10453
 
10454
  cgh.parallel_for(
10455
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10457
  [[intel::reqd_sub_group_size(32)]] {
10458
  mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
10459
  VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
10460
+ vx, vy, dst, ncols, nrows, item_ct1);
 
10461
  });
10462
  });
10463
  }
 
10472
  const sycl::range<3> block_nums(1, 1, block_num_y);
10473
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10474
  {
 
 
10475
 
10476
  stream->submit([&](sycl::handler &cgh) {
 
 
10477
 
10478
  cgh.parallel_for(
10479
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10481
  [[intel::reqd_sub_group_size(32)]] {
10482
  mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
10483
  VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
10484
+ vx, vy, dst, ncols, nrows, item_ct1);
 
10485
  });
10486
  });
10487
  }
 
10496
  const sycl::range<3> block_nums(1, 1, block_num_y);
10497
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10498
  {
 
 
10499
 
10500
  stream->submit([&](sycl::handler &cgh) {
 
 
10501
 
10502
  cgh.parallel_for(
10503
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10505
  [[intel::reqd_sub_group_size(32)]] {
10506
  mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
10507
  VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
10508
+ vx, vy, dst, ncols, nrows, item_ct1);
 
10509
  });
10510
  });
10511
  }
 
10520
  const sycl::range<3> block_nums(1, 1, block_num_y);
10521
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10522
  {
 
 
10523
 
10524
  stream->submit([&](sycl::handler &cgh) {
 
 
10525
 
10526
  cgh.parallel_for(
10527
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10529
  [[intel::reqd_sub_group_size(32)]] {
10530
  mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
10531
  VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
10532
+ vx, vy, dst, ncols, nrows, item_ct1);
 
10533
  });
10534
  });
10535
  }
 
10544
  const sycl::range<3> block_nums(1, 1, block_num_y);
10545
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10546
  {
 
 
10547
 
10548
  stream->submit([&](sycl::handler &cgh) {
 
 
10549
 
10550
  cgh.parallel_for(
10551
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10553
  [[intel::reqd_sub_group_size(32)]] {
10554
  mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
10555
  VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
10556
+ vx, vy, dst, ncols, nrows, item_ct1);
 
10557
  });
10558
  });
10559
  }
 
10568
  const sycl::range<3> block_nums(1, 1, block_num_y);
10569
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10570
  {
 
 
10571
 
10572
  stream->submit([&](sycl::handler &cgh) {
 
 
10573
 
10574
  cgh.parallel_for(
10575
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10577
  [[intel::reqd_sub_group_size(32)]] {
10578
  mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
10579
  VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
10580
+ vx, vy, dst, ncols, nrows, item_ct1);
 
10581
  });
10582
  });
10583
  }
 
10592
  const sycl::range<3> block_nums(1, 1, block_num_y);
10593
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10594
  {
 
 
10595
 
10596
  stream->submit([&](sycl::handler &cgh) {
 
 
10597
 
10598
  cgh.parallel_for(
10599
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10601
  [[intel::reqd_sub_group_size(32)]] {
10602
  mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
10603
  VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
10604
+ vx, vy, dst, ncols, nrows, item_ct1);
 
10605
  });
10606
  });
10607
  }
 
10616
  const sycl::range<3> block_nums(1, 1, block_num_y);
10617
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10618
  {
 
 
10619
 
10620
  stream->submit([&](sycl::handler &cgh) {
 
 
10621
 
10622
  cgh.parallel_for(
10623
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10625
  [[intel::reqd_sub_group_size(32)]] {
10626
  mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
10627
  VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
10628
+ vx, vy, dst, ncols, nrows, item_ct1);
 
10629
  });
10630
  });
10631
  }
10632
  }
10633
 
10634
+
10635
  static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
10636
  float *dst, const int ncols,
10637
  const int nrows,
 
10641
  const sycl::range<3> block_nums(1, 1, block_num_y);
10642
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10643
  {
 
 
 
 
10644
 
10645
  stream->submit([&](sycl::handler &cgh) {
10646
+ auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
10647
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
10648
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10649
 
10650
  cgh.parallel_for(
10651
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10668
  const sycl::range<3> block_nums(1, 1, block_num_y);
10669
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10670
  {
 
 
10671
 
10672
  stream->submit([&](sycl::handler &cgh) {
10673
+ auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
10674
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
10675
 
10676
  cgh.parallel_for(
10677
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10694
  const sycl::range<3> block_nums(1, 1, block_num_y);
10695
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10696
  {
 
 
10697
 
10698
  stream->submit([&](sycl::handler &cgh) {
10699
+ auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
10700
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
10701
 
10702
  cgh.parallel_for(
10703
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10720
  const sycl::range<3> block_nums(1, 1, block_num_y);
10721
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10722
  {
 
 
10723
 
10724
  stream->submit([&](sycl::handler &cgh) {
10725
+ auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
10726
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
10727
 
10728
  cgh.parallel_for(
10729
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
 
10746
  const sycl::range<3> block_nums(1, 1, block_num_y);
10747
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10748
  {
 
 
10749
 
10750
  stream->submit([&](sycl::handler &cgh) {
10751
+ auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
10752
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
10753
 
10754
  cgh.parallel_for(
10755
  sycl::nd_range<3>(block_nums * block_dims, block_dims),