Alberto Cabrera Pérez · romain.biessy · committed
Commit 4576ce0 · 1 Parent(s): 507d30c
sycl : implementation of reordered Q4_0 MMVQ for Intel GPUs (llama/12858)
* sycl : Implemented reorder Q4_0 mmvq
Signed-off-by: Alberto Cabrera <[email protected]>
* sycl : Fixed mmvq being called when reorder is disabled
* sycl : Improved comments in the quants header
Signed-off-by: Alberto Cabrera <[email protected]>
* Use static_assert
* safe_div -> ceil_div
* Clarify qi comment
* change the reorder tensor from init to execute OP
* dbg
* Undo changes to test-backend-ops
* Refactor changes on top of q4_0 reorder fix
* Missing Reverts
* Refactored opt_for_reorder logic to simplify code path
* Explicit inlining and unroll
* Renamed mul_mat_algo enum for consistency
---------
Signed-off-by: Alberto Cabrera <[email protected]>
Co-authored-by: romain.biessy <[email protected]>
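The core of the change is the reordered Q4_0 weight layout described in the new quants.hpp header: instead of interleaved [scale, quants] blocks, all packed quants are stored first and all scales after them, so the MMVQ kernel can read both regions with simple strided offsets. Below is a minimal, self-contained sketch of that transformation; it is illustrative only, and names such as block_q4_0_ref and reorder_q4_0 are stand-ins, not the backend's actual reorder_qw implementation.

// Illustrative sketch of the Q4_0 reorder (assumed layout, mirroring the comment in quants.hpp):
// before: [d0, qs0][d1, qs1]...[dN-1, qsN-1]   (array of interleaved blocks)
// after:  [qs0, qs1, ..., qsN-1][d0, d1, ..., dN-1]
#include <cstdint>
#include <cstring>

constexpr int QK       = 32;      // quants per Q4_0 block
constexpr int QS_BYTES = QK / 2;  // 16 bytes of packed 4-bit quants per block

struct block_q4_0_ref {           // stand-in for ggml's block_q4_0
    uint16_t d;                   // scale, stored as raw half bits
    uint8_t  qs[QS_BYTES];        // packed 4-bit quants
};

// Pack nblocks interleaved blocks into a reordered buffer: all qs first, then all d.
static void reorder_q4_0(const block_q4_0_ref * src, uint8_t * dst, size_t nblocks) {
    uint8_t * qs_out = dst;                       // leading qs region
    uint8_t * d_out  = dst + nblocks * QS_BYTES;  // trailing d region
    for (size_t i = 0; i < nblocks; ++i) {
        std::memcpy(qs_out + i * QS_BYTES, src[i].qs, QS_BYTES);
        std::memcpy(d_out + i * sizeof(uint16_t), &src[i].d, sizeof(uint16_t));
    }
}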
- ggml/src/ggml-sycl/backend.hpp +9 -8
- ggml/src/ggml-sycl/common.hpp +1 -0
- ggml/src/ggml-sycl/ggml-sycl.cpp +94 -26
- ggml/src/ggml-sycl/mmvq.cpp +158 -89
- ggml/src/ggml-sycl/quants.hpp +61 -0
- ggml/src/ggml-sycl/vecdotq.hpp +60 -11
ggml/src/ggml-sycl/backend.hpp
CHANGED
@@ -14,23 +14,24 @@
 #define GGML_SYCL_BACKEND_HPP
 
 #include "binbcast.hpp"
-#include "concat.hpp"
 #include "common.hpp"
+#include "concat.hpp"
 #include "conv.hpp"
 #include "convert.hpp"
+#include "cpy.hpp"
 #include "dequantize.hpp"
 #include "dmmv.hpp"
+#include "element_wise.hpp"
+#include "gla.hpp"
+#include "im2col.hpp"
 #include "mmq.hpp"
 #include "mmvq.hpp"
-#include "rope.hpp"
 #include "norm.hpp"
+#include "outprod.hpp"
+#include "quants.hpp"
+#include "rope.hpp"
 #include "softmax.hpp"
 #include "tsembd.hpp"
-#include "im2col.hpp"
 #include "wkv.hpp"
-#include "outprod.hpp"
-#include "element_wise.hpp"
-#include "cpy.hpp"
-#include "gla.hpp"
 
-#endif
+#endif // GGML_SYCL_BACKEND_HPP
ggml/src/ggml-sycl/common.hpp
CHANGED
@@ -42,6 +42,7 @@ void ggml_sycl_host_free(void* ptr);
 
 extern int g_ggml_sycl_debug;
 extern int g_ggml_sycl_disable_optimize;
+extern int g_ggml_sycl_prioritize_dmmv;
 
 #define GGML_SYCL_DEBUG(...) \
     do { \
ggml/src/ggml-sycl/ggml-sycl.cpp
CHANGED
@@ -49,6 +49,7 @@ static bool g_sycl_loaded = false;
 int g_ggml_sycl_debug = 0;
 int g_ggml_sycl_disable_optimize = 0;
 int g_ggml_sycl_disable_graph = 0;
+int g_ggml_sycl_prioritize_dmmv = 0;
 
 static ggml_sycl_device_info ggml_sycl_init() {
     ggml_sycl_device_info info = {};
@@ -195,11 +196,13 @@ static void ggml_check_sycl() try {
     g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
     g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
     g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
+    g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
     GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
     GGML_LOG_INFO("Running with Environment Variables:\n");
     GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
     GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
     GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
+    GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
     GGML_LOG_INFO("Build with Macros:\n");
 #if defined(GGML_SYCL_FORCE_MMQ)
     GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
@@ -2822,12 +2825,45 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
         std::exit(1);
     }
 
+enum class mul_mat_algo {
+    DMMV = 0,
+    MMVQ = 1,
+    MUL_MAT_SYCL = 2,
+};
+
 inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     // TODO: accuracy issues in MMQ
     GGML_UNUSED(type);
     return false;
 }
 
+inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
+inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
+inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
 static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -2856,7 +2892,7 @@ static void reorder_qw(char *data_device, const int ncols, const int nrows,
     GGML_ASSERT((size % sizeof(block_q4_0) == 0));
     GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
     int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2
+    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
     auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
 
     stream->parallel_for(
@@ -2884,25 +2920,44 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
     reorder_qw(data_device, ncols, nrows, size, 0, stream);
 }
 
-        ctx->opt_feature.reorder &&   //allow this device due to good perf, skip the devices with bad perf.
-        dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases.
-        src0->type == GGML_TYPE_Q4_0 &&
-        src1->ne[2]==1 && src1->ne[3]==1) {
+static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_tensor * dst) {
+    return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
+           ctx.opt_feature.reorder &&       //allow this device due to good perf, skip the devices with bad perf.
+           dst->op == GGML_OP_MUL_MAT &&    //limit to some supported cases of Q4_0, to do for more cases.
+           dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
+}
+
+static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
+                            ggml_tensor * dst, mul_mat_algo mm_algorithm) {
+    if (!should_reorder_tensor(*ctx, dst)) {
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra = static_cast<ggml_tensor_extra_gpu *>(src0->extra);
+    if (!extra || extra->optimized_feature.reorder) {
+        return;  // Skip permutations and already reordered tensors
+    }
+
+    switch (mm_algorithm) {
+        case mul_mat_algo::DMMV:
+            if (!ggml_sycl_supports_reorder_dmmv(src0->type)) {
+                return;
+            }
+            break;
+        case mul_mat_algo::MMVQ:
+            if (!ggml_sycl_supports_reorder_mmvq(src0->type)) {
+                return;
+            }
+            break;
+        case mul_mat_algo::MUL_MAT_SYCL:
+            if (!ggml_sycl_supports_reorder_mul_mat_sycl(src0->type)) {
+                return;
+            }
+            break;
+    }
+
+    reorder_qw(src0, ctx->stream());
+    extra->optimized_feature.reorder = true;  // Used to decode/dequan in next steps and avoid re-reordering
 }
 
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2911,7 +2966,8 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     int64_t min_compute_capability = INT_MAX;
 
     if (split) {
-        ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
+        ggml_backend_sycl_split_buffer_type_context * buft_ctx =
+            (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
         auto & tensor_split = buft_ctx->tensor_split;
         for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
             // skip devices that are not going to do any work:
@@ -2924,7 +2980,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
             }
         }
     } else {
-        min_compute_capability
+        min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
     }
 
     // check data types and tensor shapes for custom matrix multiplication kernels:
@@ -2946,9 +3002,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // SYCL_USE_XMX
 
+
     // mmvq path is faster in the CUDA backend.
-    if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
+    if (!g_ggml_sycl_prioritize_dmmv && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
+        // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
+        // is enabled takes precedence over DMMV, the current if-else implementation
+        // requires disabling DMMV if both conditions are met
+        || (should_reorder_tensor(ctx, dst) && ggml_sycl_supports_reorder_mmvq(src0->type)))) {
         use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+    }
 
     if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // TODO: Refactor and cleanup of mul mat dispatching.
@@ -2967,17 +3029,23 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
+        constexpr bool convert_src1_to_q8_1 = false;
+        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::DMMV);
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1);
    } else if (use_mul_mat_vec_q) {
+        constexpr bool convert_src1_to_q8_1 = true;
+        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MMVQ);
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1);
    } else if (use_mul_mat_q) {
+        constexpr bool convert_src1_to_q8_1 = true;
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
    } else {
+        constexpr bool convert_src1_to_q8_1 = false;
+        // MUL_MAT_SYCL supports reorder
+        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MUL_MAT_SYCL);
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
    }
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
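To make the new dispatch precedence easier to follow: MMVQ now wins over DMMV either on the CUDA backend or whenever the Q4_0 reorder optimization would apply, unless GGML_SYCL_PRIORITIZE_DMMV=1 restores the old priority. A condensed sketch of that rule follows; pick_dmmv is a hypothetical helper written for illustration, not code from the diff.

// Illustrative distillation of the DMMV/MMVQ precedence after this commit (assumed, not verbatim).
static bool pick_dmmv(bool can_dmmv, bool can_mmvq, bool is_cuda_backend,
                      bool reorder_applies, bool prioritize_dmmv) {
    // MMVQ is preferred on CUDA, or when the reordered Q4_0 path is usable,
    // unless the user explicitly prioritizes DMMV via the environment variable.
    const bool mmvq_preferred = !prioritize_dmmv && (is_cuda_backend || reorder_applies);
    if (mmvq_preferred) {
        can_dmmv = can_dmmv && !can_mmvq;  // drop DMMV whenever MMVQ is available
    }
    return can_dmmv;
}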
ggml/src/ggml-sycl/mmvq.cpp
CHANGED
@@ -1,6 +1,60 @@
 #include "mmvq.hpp"
+
+#include "ggml.h"
+#include "common.hpp"
+#include "quants.hpp"
 #include "vecdotq.hpp"
-
+
+template <typename reorder_vec_dot_q_sycl>
+static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+                                  const int ncols, const int nrows, const sycl::nd_item<3> & nd_item) {
+    using block_type   = ggml_sycl_reordered::block_q_t<reorder_vec_dot_q_sycl::gtype>;
+    using block_traits = typename block_type::traits;
+
+    const auto sg           = nd_item.get_sub_group();
+    const int  sg_range     = sg.get_group_linear_range();
+    const int  workgroup_id = nd_item.get_group_linear_id();
+    const int  sg_id        = sg.get_group_linear_id();
+    const int  row          = workgroup_id * sg_range + sg_id;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int     blocks_per_row              = ncols / block_traits::qk;
+    constexpr int blocks_per_subgroup         = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
+    constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
+
+    static_assert(blocks_per_subgroup > 0);
+    static_assert(block_elements_per_subgroup > 0);
+
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    float partial_sum = 0.0f;
+    for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) {
+        const int ibx = row * blocks_per_row + i;  // x block index
+        // TODO: Generalize offsets, right now only works for quantizations that don't split high and low bits
+        const int bx_offset = block_type::get_block_offset(ibx);
+        const int d_offset  = block_type::get_d_offset(nrows, ncols, ibx);
+
+        // Y block index that aligns with ibx
+        const int iby = i * block_type::block_to_q8_1_ratio();
+
+#pragma unroll
+        for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
+            // x block quant index when casting the quants to int
+            const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
+
+            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs);
+        }
+    }
+
+    auto sum = sycl::reduce_over_group(nd_item.get_sub_group(), partial_sum, std::plus<>());
+
+    if (sg.leader()) {
+        dst[row] = sum;
+    }
+}
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
 static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
@@ -480,26 +534,39 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
     }
 }
 
-static void
+static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
+                                               const int nrows, dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_0 == 0);
+    const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
+    constexpr size_t num_subgroups = 16;
+    GGML_ASSERT(block_num_y % num_subgroups == 0);
+
+    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
+    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
+
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
+                         [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                             mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0>>(vx, vy, dst, ncols, nrows,
+                                                                                           nd_item);
+                         });
+    });
+}
+
+static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
                                        dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    {
-        stream->submit([&](sycl::handler &cgh) {
-        });
 
+    {
+        stream->submit([&](sycl::handler & cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                 mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
+                                     vx, vy, dst, ncols, nrows, item_ct1);
+                             });
         });
     }
 }
@@ -916,93 +983,95 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
     }
 }
 
-void ggml_sycl_op_mul_mat_vec_q(
-    const int64_t src1_ncols, const int64_t src1_padded_col_size,
-    const dpct::queue_ptr &stream) {
-
+void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+                                ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+                                const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low,
+                                const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_col_size,
+                                const dpct::queue_ptr & stream) {
     const int64_t ne10 = src1->ne[0];
     GGML_ASSERT(ne10 % QK8_1 == 0);
 
-    const int64_t ne00
+    const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
     int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_id()));
+    SYCL_CHECK(CHECK_TRY_ERROR(id = get_current_device_id()));
     const size_t q8_1_ts = sizeof(block_q8_1);
     const size_t q8_1_bs = QK8_1;
     // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
 
-    for (int i = 0; i < src1_ncols; i++)
-    {
+    for (int i = 0; i < src1_ncols; i++) {
         const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
-        const char* src1_ddq_i_bs
-        float*
+        const char * src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset;
+        float *      dst_dd_i_bs   = dst_dd_i + i * dst->ne[0];
         switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
+                    ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                    GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_0_q8_1_sycl\n");
+                    reorder_mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                } else {
+                    GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_0_q8_1_sycl\n");
+                    mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                }
+                break;
+            case GGML_TYPE_Q4_1:
+                mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q5_0:
+                mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q5_1:
+                mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q8_0:
+                mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q2_K:
+                mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q3_K:
+                mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q4_K:
+                mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q5_K:
+                mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q6_K:
+                mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ1_S:
+                mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ1_M:
+                mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ2_XXS:
+                mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ2_XS:
+                mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ2_S:
+                mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ3_XXS:
+                mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ3_S:
+                mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ4_NL:
+                mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ4_XS:
+                mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            default:
+                GGML_ABORT("fatal error");
         }
     }
     GGML_UNUSED(src1);
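As a sanity check on the reorder kernel's index math for Q4_0, assuming WARP_SIZE == 32 (the sub-group width the SYCL backend uses on most targets): with vdr_mmvq = 2 and QI4_0 = 4, each sub-group covers 16 blocks per loop iteration and two work-items cooperate on each block. A compile-time sketch of that arithmetic, with the values hard-coded purely for illustration:

// Worked example of mul_mat_vec_q_reorder's block partitioning for Q4_0 (assumed WARP_SIZE = 32).
constexpr int ceil_div_ex(int a, int b) { return (a + b - 1) / b; }

constexpr int WARP_SIZE_EX = 32;  // assumption: sub-group width
constexpr int QK4_0_EX     = 32;  // weights per Q4_0 block
constexpr int QI4_0_EX     = 4;   // 32-bit ints holding one block's quants (32 * 4 bits / 32)
constexpr int VDR_MMVQ_EX  = 2;   // contiguous ints each work-item processes per vec-dot call

constexpr int blocks_per_subgroup_ex         = ceil_div_ex(VDR_MMVQ_EX * WARP_SIZE_EX, QI4_0_EX);  // 16
constexpr int block_elements_per_subgroup_ex = QI4_0_EX / VDR_MMVQ_EX;                             // 2

static_assert(blocks_per_subgroup_ex == 16, "a 32-wide sub-group covers 16 Q4_0 blocks per iteration");
static_assert(block_elements_per_subgroup_ex == 2, "two work-items cooperate on each Q4_0 block");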
ggml/src/ggml-sycl/quants.hpp
ADDED
@@ -0,0 +1,61 @@
+//
+// MIT license
+// Copyright (C) 2025 Codeplay Software Ltd.
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_QUANTS_HPP
+#define GGML_SYCL_QUANTS_HPP
+
+#include "ggml-common.h"
+#include "ggml.h"
+
+namespace ggml_sycl_reordered {
+
+
+// The reordered block moves quants (qs) and scales(d) to two
+// uniform regions of memory that is contiguous in the same tensor.
+// What this means is that instead of having:
+// [d0, qs0] [d1, qs1] [d2, qs2] ... [dN, qsN]
+// We have:
+// [qs0, qs1, qs2, ..., qsN] [d0, d1, d2, ..., dN]
+//
+// Notes: out-of-bounds qs will run into d values
+// Aligment relies on the allocated size of qs
+
+template <ggml_type type> struct block_q_t;
+
+
+// qk number of weights / quants in a block
+// qr number of weights in a byte (described as 'before dequantization')
+// for quantization types that has low and high bits split, qr is calculated with
+// using the lower bits, e.g for Q6 quants QR6 is 2
+// qi number of 32 bit integers needed to represent all the quants from a block (`qs` field)
+// See ggml-common.h to see how these are calculated
+template <> struct block_q_t<GGML_TYPE_Q4_0> {
+    struct traits {
+        static constexpr uint32_t qk = QK4_0;
+        static constexpr uint32_t qi = QI4_0;
+        static constexpr uint32_t qr = QR4_0;
+        static constexpr uint32_t vdr_mmvq = 2;
+    };
+
+    static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
+
+    static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
+        return (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half);
+    }
+
+    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
+};
+
+} // namespace ggml_sycl_reordered
+
+#endif // GGML_SYCL_QUANTS_HPP
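A small worked example of the offset helpers above, for a hypothetical 4096 x 4096 Q4_0 weight tensor (sizes chosen only for illustration): the leading qs region occupies ncols/2 * nrows = 8,388,608 bytes, block 5's packed quants start at byte 80, and its half-precision scale sits 10 bytes into the trailing d region.

// Worked example of the reordered Q4_0 byte offsets (mirrors block_q_t<GGML_TYPE_Q4_0> in quants.hpp).
#include <cstdio>

int main() {
    constexpr int qk        = 32;  // QK4_0
    constexpr int qr        = 2;   // QR4_0: two 4-bit quants per byte
    constexpr int half_size = 2;   // sizeof(ggml_half)

    const long nrows = 4096, ncols = 4096;  // hypothetical tensor shape
    const long block_index = 5;

    // Byte offset of this block's packed quants inside the leading qs region.
    const long qs_offset = block_index * (qk / qr);                      // 5 * 16 = 80
    // The d region starts right after all qs bytes (ncols / qr * nrows).
    const long d_offset  = ncols / qr * nrows + block_index * half_size; // 8388608 + 10

    std::printf("qs at byte %ld, scale at byte %ld\n", qs_offset, d_offset);
    return 0;
}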
ggml/src/ggml-sycl/vecdotq.hpp
CHANGED
@@ -1,6 +1,6 @@
 //
 // MIT license
-// Copyright (C)
+// Copyright (C) 2025 Intel Corporation
 // SPDX-License-Identifier: MIT
 //
 
@@ -14,8 +14,11 @@
 #define GGML_SYCL_VECDOTQ_HPP
 
 #include "dpct/helper.hpp"
+#include "ggml.h"
+#include "quants.hpp"
 
-typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
+typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
+                                  const int & iqs);
 
 static __dpct_inline__ int get_int_from_int8(const int8_t* x8, const int& i32) {
     const uint16_t* x16 =
@@ -252,13 +255,60 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
 
+template <ggml_type T> struct reorder_vec_dot_q_sycl {
+    static_assert(T != T, "ggml_type for reorder vecdot not implemented");
+};
+
+template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
+    static constexpr ggml_type gtype = GGML_TYPE_Q4_0;
+
+    using q4_0_block  = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_0>;
+    using q4_0_traits = typename q4_0_block::traits;
+
+    __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4, const sycl::half2 & ds8) {
+        int sumi = 0;
+
+#pragma unroll
+        for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
+            const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+            const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+            // SIMD dot product of quantized values
+            sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
+            sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
+        }
+
+        const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();
+
+        // second part effectively subtracts 8 from each quant value
+        return d4 * (sumi * ds8f.x() - (8 * q4_0_traits::vdr_mmvq / q4_0_traits::qi) * ds8f.y());
+    }
+
+    __dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
+                                     const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+        const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
+        const ggml_half d     = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
+        int v[q4_0_traits::vdr_mmvq];
+        int u[2 * q4_0_traits::vdr_mmvq];
+
+#pragma unroll
+
+        for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
+            v[i]         = get_int_from_uint8(bq4_0, iqs + i);
+            u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+            u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + q4_0_traits::qi);
+        }
+
+        return vec_dot_q4_0_q8_1_impl(v, u, d, bq8_1->ds);
+    };
+};
+
 #define VDR_Q4_0_Q8_1_MMVQ 2
 #define VDR_Q4_0_Q8_1_MMQ 4
 
 template <int vdr>
-static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
-                                                    const
-                                                    const sycl::half2 &ds8) {
+static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4,
+                                                    const sycl::half2 & ds8) {
     int sumi = 0;
 #pragma unroll
     for (int i = 0; i < vdr; ++i) {
@@ -270,8 +320,7 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
         sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
     }
 
-    const sycl::float2 ds8f =
-        ds8.convert<float, sycl::rounding_mode::automatic>();
+    const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();
 
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y());
@@ -456,13 +505,13 @@ vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
     int v[VDR_Q4_0_Q8_1_MMVQ];
-    int u[2*VDR_Q4_0_Q8_1_MMVQ];
+    int u[2 * VDR_Q4_0_Q8_1_MMVQ];
 
 #pragma unroll
     for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
-        v[i]
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
+        v[i]         = get_int_from_uint8(bq4_0->qs, iqs + i);
+        u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
     }
 
     return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);