Alberto Cabrera Pérez and romain.biessy committed
Commit 4576ce0 · 1 Parent(s): 507d30c

sycl : implementation of reordered Q4_0 MMVQ for Intel GPUs (llama/12858)


* sycl : Implemented reorder Q4_0 mmvq

Signed-off-by: Alberto Cabrera <[email protected]>

* sycl : Fixed mmvq being called when reorder is disabled

* sycl : Improved comments in the quants header

Signed-off-by: Alberto Cabrera <[email protected]>

* Use static_assert

* safe_div -> ceil_div

* Clarify qi comment

* change the reorder tensor from init to execute OP

* dbg

* Undo changes to test-backend-ops

* Refactor changes on top of q4_0 reorder fix

* Missing Reverts

* Refactored opt_for_reorder logic to simplify code path

* Explicit inlining and unroll

* Renamed mul_mat_algo enum for consistency

---------

Signed-off-by: Alberto Cabrera <[email protected]>
Co-authored-by: romain.biessy <[email protected]>
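
For context on what "reordered" means here: the optimization rewrites a Q4_0 tensor from the usual array-of-blocks layout ([d0, qs0][d1, qs1]...) into two contiguous regions, all packed quants first and all fp16 scales after ([qs0, qs1, ...][d0, d1, ...]); that is what the backend's reorder_qw kernel produces and what the new MMVQ path consumes. The sketch below is only a host-side illustration of that layout, assuming QK4_0 = 32; block_q4_0_plain and reorder_q4_0_host are made-up names for this sketch, not part of the commit.

// Host-side illustration only: a simplified stand-in for ggml's block_q4_0
// (QK4_0 = 32 weights per block: 16 packed nibble bytes plus one fp16 scale).
#include <cstdint>
#include <cstring>
#include <vector>

constexpr int QK4_0 = 32;

struct block_q4_0_plain {          // AoS layout: [d, qs] per block
    uint16_t d;                    // scale stored as raw fp16 bits (stand-in for sycl::half)
    uint8_t  qs[QK4_0 / 2];        // 32 4-bit quants packed into 16 bytes
};

// Reordered (SoA) layout used by the SYCL backend:
//   [qs_0 | qs_1 | ... | qs_{N-1}][d_0 | d_1 | ... | d_{N-1}]
std::vector<uint8_t> reorder_q4_0_host(const block_q4_0_plain * blocks, size_t nblocks) {
    const size_t qs_bytes = nblocks * (QK4_0 / 2);
    std::vector<uint8_t> out(qs_bytes + nblocks * sizeof(uint16_t));
    uint8_t *  qs_ptr = out.data();
    uint16_t * d_ptr  = reinterpret_cast<uint16_t *>(out.data() + qs_bytes);
    for (size_t i = 0; i < nblocks; ++i) {
        std::memcpy(qs_ptr + i * (QK4_0 / 2), blocks[i].qs, QK4_0 / 2);  // quants region
        d_ptr[i] = blocks[i].d;                                          // scales region
    }
    return out;
}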

ggml/src/ggml-sycl/backend.hpp CHANGED
@@ -14,23 +14,24 @@
 #define GGML_SYCL_BACKEND_HPP
 
 #include "binbcast.hpp"
-#include "concat.hpp"
 #include "common.hpp"
+#include "concat.hpp"
 #include "conv.hpp"
 #include "convert.hpp"
+#include "cpy.hpp"
 #include "dequantize.hpp"
 #include "dmmv.hpp"
+#include "element_wise.hpp"
+#include "gla.hpp"
+#include "im2col.hpp"
 #include "mmq.hpp"
 #include "mmvq.hpp"
-#include "rope.hpp"
 #include "norm.hpp"
+#include "outprod.hpp"
+#include "quants.hpp"
+#include "rope.hpp"
 #include "softmax.hpp"
 #include "tsembd.hpp"
-#include "im2col.hpp"
 #include "wkv.hpp"
-#include "outprod.hpp"
-#include "element_wise.hpp"
-#include "cpy.hpp"
-#include "gla.hpp"
 
 #endif // GGML_SYCL_BACKEND_HPP
ggml/src/ggml-sycl/common.hpp CHANGED
@@ -42,6 +42,7 @@ void ggml_sycl_host_free(void* ptr);
 
 extern int g_ggml_sycl_debug;
 extern int g_ggml_sycl_disable_optimize;
+extern int g_ggml_sycl_prioritize_dmmv;
 
 #define GGML_SYCL_DEBUG(...) \
     do { \
ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -49,6 +49,7 @@ static bool g_sycl_loaded = false;
 int g_ggml_sycl_debug = 0;
 int g_ggml_sycl_disable_optimize = 0;
 int g_ggml_sycl_disable_graph = 0;
+int g_ggml_sycl_prioritize_dmmv = 0;
 
 static ggml_sycl_device_info ggml_sycl_init() {
     ggml_sycl_device_info info = {};
@@ -195,11 +196,13 @@ static void ggml_check_sycl() try {
     g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
     g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
     g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
+    g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
     GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
     GGML_LOG_INFO("Running with Environment Variables:\n");
     GGML_LOG_INFO("  GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
     GGML_LOG_INFO("  GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
     GGML_LOG_INFO("  GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
+    GGML_LOG_INFO("  GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
     GGML_LOG_INFO("Build with Macros:\n");
 #if defined(GGML_SYCL_FORCE_MMQ)
     GGML_LOG_INFO("  GGML_SYCL_FORCE_MMQ: yes\n");
@@ -2822,12 +2825,45 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
     std::exit(1);
 }
 
+enum class mul_mat_algo {
+    DMMV = 0,
+    MMVQ = 1,
+    MUL_MAT_SYCL = 2,
+};
+
 inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     // TODO: accuracy issues in MMQ
     GGML_UNUSED(type);
     return false;
 }
 
+inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
+inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
+inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
 static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -2856,7 +2892,7 @@ static void reorder_qw(char *data_device, const int ncols, const int nrows,
     GGML_ASSERT((size % sizeof(block_q4_0) == 0));
     GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
     int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
+    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
     auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
 
     stream->parallel_for(
@@ -2884,25 +2920,44 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
     reorder_qw(data_device, ncols, nrows, size, 0, stream);
 }
 
-/*
-* This function could be called when the OP (mul_mat) function support reorder optimizition.
-*/
-static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
-    ggml_tensor * dst) {
-    if (!g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
-        ctx->opt_feature.reorder &&      //allow this device due to good perf, skip the devices with bad perf.
-        dst->op == GGML_OP_MUL_MAT &&    //limit to some supported cases of Q4_0, to do for more cases.
-        src0->type == GGML_TYPE_Q4_0 &&
-        src1->ne[2]==1 && src1->ne[3]==1) {
+static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_tensor * dst) {
+    return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
+           ctx.opt_feature.reorder &&       //allow this device due to good perf, skip the devices with bad perf.
+           dst->op == GGML_OP_MUL_MAT &&    //limit to some supported cases of Q4_0, to do for more cases.
+           dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
+}
 
-        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
-        if (!extra) return; //only happen in CI/UT permute case.
+static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
+                            ggml_tensor * dst, mul_mat_algo mm_algorithm) {
+    if (!should_reorder_tensor(*ctx, dst)) {
+        return;
+    }
 
-        if (extra->optimized_feature.reorder) return; //skip the tensor which is handled for reorder.
+    ggml_tensor_extra_gpu * extra = static_cast<ggml_tensor_extra_gpu *>(src0->extra);
+    if (!extra || extra->optimized_feature.reorder) {
+        return; // Skip permutations and already reordered tensors
+    }
 
-        reorder_qw(src0, ctx->stream());
-        extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
+    switch (mm_algorithm) {
+        case mul_mat_algo::DMMV:
+            if (!ggml_sycl_supports_reorder_dmmv(src0->type)) {
+                return;
+            }
+            break;
+        case mul_mat_algo::MMVQ:
+            if (!ggml_sycl_supports_reorder_mmvq(src0->type)) {
+                return;
+            }
+            break;
+        case mul_mat_algo::MUL_MAT_SYCL:
+            if (!ggml_sycl_supports_reorder_mul_mat_sycl(src0->type)) {
+                return;
+            }
+            break;
     }
+
+    reorder_qw(src0, ctx->stream());
+    extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
 }
 
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2911,7 +2966,8 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     int64_t min_compute_capability = INT_MAX;
 
     if (split) {
-        ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
+        ggml_backend_sycl_split_buffer_type_context * buft_ctx =
+            (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
         auto & tensor_split = buft_ctx->tensor_split;
         for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
             // skip devices that are not going to do any work:
@@ -2924,7 +2980,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
             }
         }
     } else {
-        min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
+        min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
     }
 
     // check data types and tensor shapes for custom matrix multiplication kernels:
@@ -2946,9 +3002,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // SYCL_USE_XMX
 
+
     // mmvq path is faster in the CUDA backend.
-    if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda)
+    if (!g_ggml_sycl_prioritize_dmmv && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
+        // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
+        // is enabled takes precedence over DMMV, the current if-else implementation
+        // requires disabling DMMV if both conditions are met
+        || (should_reorder_tensor(ctx, dst) && ggml_sycl_supports_reorder_mmvq(src0->type)))) {
         use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+    }
 
     if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // TODO: Refactor and cleanup of mul mat dispatching.
@@ -2967,17 +3029,23 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
-        opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder.
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
-        // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
+        constexpr bool convert_src1_to_q8_1 = false;
+        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::DMMV);
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1);
    } else if (use_mul_mat_vec_q) {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
+        constexpr bool convert_src1_to_q8_1 = true;
+        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MMVQ);
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1);
     } else if (use_mul_mat_q) {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
+        constexpr bool convert_src1_to_q8_1 = true;
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
     } else {
-        opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder.
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
+        constexpr bool convert_src1_to_q8_1 = false;
+        // MUL_MAT_SYCL supports reorder
+        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MUL_MAT_SYCL);
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
     }
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
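
The net effect of the dispatch change above: unless GGML_SYCL_PRIORITIZE_DMMV is set, MMVQ now wins over DMMV either on the CUDA backend or whenever the Q4_0 reorder optimization applies to the tensor. A minimal restatement of that rule (sketch only, with hypothetical parameter names standing in for the expressions used in ggml_sycl_mul_mat):

// Sketch: mirrors the new precedence rule, not the backend's actual code.
static bool demote_dmmv_in_favor_of_mmvq(bool prioritize_dmmv,            // GGML_SYCL_PRIORITIZE_DMMV != 0
                                         bool is_cuda_backend,            // stream backend is ext_oneapi_cuda
                                         bool reorder_applies,            // should_reorder_tensor(ctx, dst)
                                         bool mmvq_supports_reorder) {    // ggml_sycl_supports_reorder_mmvq(src0->type)
    if (prioritize_dmmv) {
        return false;  // keep the previous DMMV-first behaviour
    }
    return is_cuda_backend || (reorder_applies && mmvq_supports_reorder);
}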
 
ggml/src/ggml-sycl/mmvq.cpp CHANGED
@@ -1,6 +1,60 @@
 #include "mmvq.hpp"
+
+#include "ggml.h"
+#include "common.hpp"
+#include "quants.hpp"
 #include "vecdotq.hpp"
-#include <cassert>
+
+template <typename reorder_vec_dot_q_sycl>
+static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+                                  const int ncols, const int nrows, const sycl::nd_item<3> & nd_item) {
+    using block_type = ggml_sycl_reordered::block_q_t<reorder_vec_dot_q_sycl::gtype>;
+    using block_traits = typename block_type::traits;
+
+    const auto sg = nd_item.get_sub_group();
+    const int sg_range = sg.get_group_linear_range();
+    const int workgroup_id = nd_item.get_group_linear_id();
+    const int sg_id = sg.get_group_linear_id();
+    const int row = workgroup_id * sg_range + sg_id;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / block_traits::qk;
+    constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
+    constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
+
+    static_assert(blocks_per_subgroup > 0);
+    static_assert(block_elements_per_subgroup > 0);
+
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    float partial_sum = 0.0f;
+    for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) {
+        const int ibx = row * blocks_per_row + i; // x block index
+        // TODO: Generalize offsets, right now only works for quantizations that don't split high and low bits
+        const int bx_offset = block_type::get_block_offset(ibx);
+        const int d_offset = block_type::get_d_offset(nrows, ncols, ibx);
+
+        // Y block index that aligns with ibx
+        const int iby = i * block_type::block_to_q8_1_ratio();
+
+#pragma unroll
+        for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
+            // x block quant index when casting the quants to int
+            const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
+
+            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs);
+        }
+    }
+
+    auto sum = sycl::reduce_over_group(nd_item.get_sub_group(), partial_sum, std::plus<>());
+
+    if (sg.leader()) {
+        dst[row] = sum;
+    }
+}
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
 static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
@@ -480,26 +534,39 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
     }
 }
 
-static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
+static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
+                                               const int nrows, dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_0 == 0);
+    const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
+    constexpr size_t num_subgroups = 16;
+    GGML_ASSERT(block_num_y % num_subgroups == 0);
+
+    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
+    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
+
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
+                         [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                             mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0>>(vx, vy, dst, ncols, nrows,
                                                                                            nd_item);
+                         });
+    });
+}
+
+static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
                                        dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    {
-
-        stream->submit([&](sycl::handler &cgh) {
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                        mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
-                                      VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
-                            vx, vy, dst, ncols, nrows, item_ct1);
-                    });
+    {
+        stream->submit([&](sycl::handler & cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                 mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
+                                     vx, vy, dst, ncols, nrows, item_ct1);
+                             });
         });
     }
 }
@@ -916,93 +983,95 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
     }
 }
 
-void ggml_sycl_op_mul_mat_vec_q(
-    ggml_backend_sycl_context & ctx,
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_col_size,
-    const dpct::queue_ptr &stream) {
-
+void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+                                ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+                                const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low,
+                                const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_col_size,
+                                const dpct::queue_ptr & stream) {
     const int64_t ne10 = src1->ne[0];
     GGML_ASSERT(ne10 % QK8_1 == 0);
 
-    const int64_t ne00 = src0->ne[0];
+    const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
     int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_id()));
+    SYCL_CHECK(CHECK_TRY_ERROR(id = get_current_device_id()));
     const size_t q8_1_ts = sizeof(block_q8_1);
     const size_t q8_1_bs = QK8_1;
     // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
 
-    for (int i = 0; i < src1_ncols; i++)
-    {
+    for (int i = 0; i < src1_ncols; i++) {
         const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
-        const char* src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset;
-        float* dst_dd_i_bs = dst_dd_i + i * dst->ne[0];
+        const char * src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset;
+        float * dst_dd_i_bs = dst_dd_i + i * dst->ne[0];
         switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ1_S:
-            mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ1_M:
-            mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ2_XXS:
-            mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ2_XS:
-            mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ2_S:
-            mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ3_XXS:
-            mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ3_S:
-            mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ4_NL:
-            mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ4_XS:
-            mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        default:
-            GGML_ABORT("fatal error");
+            case GGML_TYPE_Q4_0:
+                if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
+                    ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                    GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_0_q8_1_sycl\n");
+                    reorder_mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                } else {
+                    GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_0_q8_1_sycl\n");
+                    mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                }
+                break;
+            case GGML_TYPE_Q4_1:
+                mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q5_0:
+                mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q5_1:
+                mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q8_0:
+                mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q2_K:
+                mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q3_K:
+                mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q4_K:
+                mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q5_K:
+                mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q6_K:
+                mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ1_S:
+                mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ1_M:
+                mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ2_XXS:
+                mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ2_XS:
+                mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ2_S:
+                mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ3_XXS:
+                mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ3_S:
+                mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ4_NL:
+                mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ4_XS:
+                mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            default:
+                GGML_ABORT("fatal error");
         }
     }
     GGML_UNUSED(src1);
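
To make the work split in mul_mat_vec_q_reorder concrete: each subgroup owns one output row, and within the subgroup each work item starts at block lane / block_elements_per_subgroup and strides by blocks_per_subgroup, reading vdr_mmvq packed ints per vec-dot call. The standalone sketch below just prints that mapping for Q4_0, assuming the backend's WARP_SIZE is 32 (the Q4_0 constants QK4_0 = 32, QI4_0 = 4, vdr_mmvq = 2 come from quants.hpp); ncols = 4096 is an arbitrary example.

#include <cstdio>

constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
    // Q4_0 traits as declared in quants.hpp; WARP_SIZE = 32 is an assumption of this sketch.
    constexpr int WARP_SIZE = 32;
    constexpr int qk       = 32;  // QK4_0: quant values per block
    constexpr int qi       = 4;   // QI4_0: 32-bit ints holding one block's packed quants
    constexpr int vdr_mmvq = 2;   // packed ints consumed per vec-dot call

    constexpr int blocks_per_subgroup         = ceil_div(vdr_mmvq * WARP_SIZE, qi);  // 16
    constexpr int block_elements_per_subgroup = qi / vdr_mmvq;                       // 2

    const int ncols          = 4096;        // hypothetical row length
    const int blocks_per_row = ncols / qk;  // 128 blocks handled per row

    // Mapping used by mul_mat_vec_q_reorder: lane -> first block and starting int index.
    for (int lane = 0; lane < WARP_SIZE; ++lane) {
        const int first_block = lane / block_elements_per_subgroup;  // then strides by blocks_per_subgroup
        const int iqs         = vdr_mmvq * (lane % block_elements_per_subgroup);
        std::printf("lane %2d -> first block %2d of %d, iqs %d (stride %d)\n",
                    lane, first_block, blocks_per_row, iqs, blocks_per_subgroup);
    }
    return 0;
}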
ggml/src/ggml-sycl/quants.hpp ADDED
@@ -0,0 +1,61 @@
+//
+// MIT license
+// Copyright (C) 2025 Codeplay Software Ltd.
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_QUANTS_HPP
+#define GGML_SYCL_QUANTS_HPP
+
+#include "ggml-common.h"
+#include "ggml.h"
+
+namespace ggml_sycl_reordered {
+
+
+// The reordered block moves quants (qs) and scales(d) to two
+// uniform regions of memory that is contiguous in the same tensor.
+// What this means is that instead of having:
+// [d0, qs0] [d1, qs1] [d2, qs2] ... [dN, qsN]
+// We have:
+// [qs0, qs1, qs2, ..., qsN] [d0, d1, d2, ..., dN]
+//
+// Notes: out-of-bounds qs will run into d values
+// Aligment relies on the allocated size of qs
+
+template <ggml_type type> struct block_q_t;
+
+
+// qk number of weights / quants in a block
+// qr number of weights in a byte (described as 'before dequantization')
+// for quantization types that has low and high bits split, qr is calculated with
+// using the lower bits, e.g for Q6 quants QR6 is 2
+// qi number of 32 bit integers needed to represent all the quants from a block (`qs` field)
+// See ggml-common.h to see how these are calculated
+template <> struct block_q_t<GGML_TYPE_Q4_0> {
+    struct traits {
+        static constexpr uint32_t qk = QK4_0;
+        static constexpr uint32_t qi = QI4_0;
+        static constexpr uint32_t qr = QR4_0;
+        static constexpr uint32_t vdr_mmvq = 2;
+    };
+
+    static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
+
+    static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
+        return (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half);
+    }
+
+    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
+};
+
+} // namespace ggml_sycl_reordered
+
+#endif // GGML_SYCL_QUANTS_HPP
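
As a quick sanity check of the two offset helpers above (with QK4_0 = 32, QR4_0 = 2, and a 2-byte ggml_half): for a single row of 4096 columns there are 128 blocks, so the packed quants occupy 2048 bytes, block 3's quants start at byte 48 of the qs region, and its scale sits at byte 2054 of the tensor data. A tiny compile-time check of that arithmetic, with the constants inlined rather than pulled from the ggml headers:

#include <cstdint>

// Hypothetical mirror of block_q_t<GGML_TYPE_Q4_0> with QK4_0 = 32, QR4_0 = 2 and
// sizeof(ggml_half) = 2 inlined, used only to check the offset arithmetic.
constexpr int qk = 32, qr = 2;

constexpr int get_block_offset(int block_index) { return block_index * (qk / qr); }
constexpr int get_d_offset(int nrows, int ncols, int block_index) {
    return (ncols / qr * nrows) + block_index * 2 /* sizeof(ggml_half) */;
}

// One row of 4096 columns = 128 Q4_0 blocks: 128 * 16 = 2048 bytes of packed quants,
// followed by 128 fp16 scales.
static_assert(get_block_offset(3) == 48,        "block 3 starts 3 * 16 bytes into the qs region");
static_assert(get_d_offset(1, 4096, 3) == 2054, "its scale sits after 2048 qs bytes plus 3 halves");

int main() { return 0; }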
ggml/src/ggml-sycl/vecdotq.hpp CHANGED
@@ -1,6 +1,6 @@
 //
 // MIT license
-// Copyright (C) 2024 Intel Corporation
+// Copyright (C) 2025 Intel Corporation
 // SPDX-License-Identifier: MIT
 //
 
@@ -14,8 +14,11 @@
 #define GGML_SYCL_VECDOTQ_HPP
 
 #include "dpct/helper.hpp"
+#include "ggml.h"
+#include "quants.hpp"
 
-typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
+typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
+                                  const int & iqs);
 
 static __dpct_inline__ int get_int_from_int8(const int8_t* x8, const int& i32) {
     const uint16_t* x16 =
@@ -252,13 +255,60 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
 
+template <ggml_type T> struct reorder_vec_dot_q_sycl {
+    static_assert(T != T, "ggml_type for reorder vecdot not implemented");
+};
+
+template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
+    static constexpr ggml_type gtype = GGML_TYPE_Q4_0;
+
+    using q4_0_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_0>;
+    using q4_0_traits = typename q4_0_block::traits;
+
+    __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4, const sycl::half2 & ds8) {
+        int sumi = 0;
+
+#pragma unroll
+        for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
+            const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+            const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+            // SIMD dot product of quantized values
+            sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
+            sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
+        }
+
+        const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();
+
+        // second part effectively subtracts 8 from each quant value
+        return d4 * (sumi * ds8f.x() - (8 * q4_0_traits::vdr_mmvq / q4_0_traits::qi) * ds8f.y());
+    }
+
+    __dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
+                                     const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+        const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
+        const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
+        int v[q4_0_traits::vdr_mmvq];
+        int u[2 * q4_0_traits::vdr_mmvq];
+
+#pragma unroll
+
+        for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
+            v[i] = get_int_from_uint8(bq4_0, iqs + i);
+            u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+            u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + q4_0_traits::qi);
+        }
+
+        return vec_dot_q4_0_q8_1_impl(v, u, d, bq8_1->ds);
+    };
+};
+
 #define VDR_Q4_0_Q8_1_MMVQ 2
 #define VDR_Q4_0_Q8_1_MMQ  4
 
 template <int vdr>
-static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
-                                                    const float &d4,
-                                                    const sycl::half2 &ds8) {
+static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4,
+                                                    const sycl::half2 & ds8) {
     int sumi = 0;
 #pragma unroll
     for (int i = 0; i < vdr; ++i) {
@@ -270,8 +320,7 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
         sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
     }
 
-    const sycl::float2 ds8f =
-        ds8.convert<float, sycl::rounding_mode::automatic>();
+    const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();
 
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y());
@@ -456,13 +505,13 @@ vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
     int v[VDR_Q4_0_Q8_1_MMVQ];
-    int u[2*VDR_Q4_0_Q8_1_MMVQ];
+    int u[2 * VDR_Q4_0_Q8_1_MMVQ];
 
 #pragma unroll
     for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
-        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
+        v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
+        u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
     }
 
     return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
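
The reorder functor reuses the same integer dot-product trick as the original vec_dot_q4_0_q8_1: four packed bytes are masked with 0x0F0F0F0F to get the low nibbles and shifted right by 4 for the high nibbles, and the implicit "- 8" centring of Q4_0 is folded into the final d4 * (sumi * ds8f.x() - ... * ds8f.y()) expression instead of being applied per value. A trivial standalone illustration of the nibble unpacking (the example byte is chosen arbitrarily):

#include <cstdint>
#include <cstdio>

// Each byte of a Q4_0 block packs two unsigned 4-bit quants; the kernels process
// four such bytes at once as a 32-bit int with the 0x0F0F0F0F mask.
int main() {
    const uint8_t packed = 0xB3;          // low nibble 3, high nibble 11
    const int lo = packed & 0x0F;         // 3  -> centred value  3 - 8 = -5
    const int hi = (packed >> 4) & 0x0F;  // 11 -> centred value 11 - 8 =  3
    std::printf("low: %d (centred %d), high: %d (centred %d)\n", lo, lo - 8, hi, hi - 8);
    return 0;
}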