Neo Zhang Jianyu and arthw committed
Commit 14fd317 · Parent: 310a36c
Optimize mul_mat for Q4_0 on Intel GPU (llama/12035)
* Optimize performance by reordering the Q4_0 data layout for Intel GPUs
* Detect the hardware type, save the optimization feature, and print it
* Correct a name
* Optimize the graph once during graph compute, record the optimization status in tensor->extra, and make CI pass
* Add the environment variable GGML_SYCL_DISABLE_OPT for debugging
* Use syclex::architecture instead of the custom hardware defines, and update the guide for GGML_SYCL_DISABLE_OPT
* Add performance data
* Move the getrows functions to separate files
* Fix global variables

---------

Co-authored-by: arthw <[email protected]>
- ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- ggml/src/ggml-sycl/common.cpp +17 -0
- ggml/src/ggml-sycl/common.hpp +49 -9
- ggml/src/ggml-sycl/convert.cpp +33 -4
- ggml/src/ggml-sycl/convert.hpp +2 -2
- ggml/src/ggml-sycl/dequantize.hpp +55 -0
- ggml/src/ggml-sycl/dmmv.cpp +136 -4
- ggml/src/ggml-sycl/getrows.cpp +308 -0
- ggml/src/ggml-sycl/getrows.hpp +23 -0
- ggml/src/ggml-sycl/ggml-sycl.cpp +126 -243
- ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
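Note: the reordered Q4_0 layout that the kernels below rely on stores all packed 4-bit values first and appends the fp16 scales at byte offset ncols*nrows/2 (see d_ptr in dmmv.cpp and s_ptr in dequantize.hpp). Below is a minimal host-side sketch of that split, assuming a row-major weight matrix; block_q4_0 mirrors ggml's struct, and reorder_q4_0 is an illustrative name, not a function from this patch.

// Host-side sketch: split conventional AoS Q4_0 blocks into the
// [all qs bytes][all fp16 scales] layout the new SYCL kernels expect.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

constexpr int QK4_0 = 32; // quantized weights per block (from ggml)

struct block_q4_0 {          // conventional layout: scale + nibbles per block
    uint16_t d;              // fp16 scale bits
    uint8_t  qs[QK4_0 / 2];  // 32 4-bit weights packed into 16 bytes
};

static std::vector<uint8_t> reorder_q4_0(const block_q4_0 * blocks, size_t nblocks) {
    std::vector<uint8_t> out(nblocks * sizeof(block_q4_0));
    uint8_t * qs_dst = out.data();                       // nibble region first
    uint8_t * d_dst  = out.data() + nblocks * QK4_0 / 2; // scales appended after
    for (size_t ib = 0; ib < nblocks; ++ib) {
        std::memcpy(qs_dst + ib * (QK4_0 / 2), blocks[ib].qs, QK4_0 / 2);
        std::memcpy(d_dst + ib * sizeof(uint16_t), &blocks[ib].d, sizeof(uint16_t));
    }
    return out;
}

int main() {
    block_q4_0 blocks[2] = {};
    auto packed = reorder_q4_0(blocks, 2);
    return packed.size() == 2 * sizeof(block_q4_0) ? 0 : 1;
}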
ggml/src/ggml-sycl/CMakeLists.txt
CHANGED
@@ -1,3 +1,5 @@
+message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")
+
 if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
     message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
 endif()
ggml/src/ggml-sycl/common.cpp
CHANGED
@@ -99,3 +99,20 @@ catch (sycl::exception const &exc) {
               << ", line:" << __LINE__ << std::endl;
     std::exit(1);
 }
+
+
+void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
+            if (extra->events[i][is] != nullptr) {
+                SYCL_CHECK(CHECK_TRY_ERROR(dpct::destroy_event(extra->events[i][is])));
+            }
+        }
+        if (extra->data_device[i] != nullptr && streams.size()>0) {
+            ggml_sycl_set_device(i);
+            SYCL_CHECK(
+                CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
+        }
+    }
+    delete extra;
+}
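Note: the streams.size() > 0 guard gives release_extra_gpu two modes — called with the default empty vector (see the declaration added in common.hpp) it only destroys the per-device events; called with the owning streams it also frees the device buffers. A plain-C++ model of that contract follows; all types and names here are illustrative, not from the patch.

// Models the two-mode cleanup contract of release_extra_gpu.
#include <cstdio>
#include <vector>

struct extra_t { bool events_alive; bool data_alive; };

static void release(extra_t * extra, std::vector<int> streams = {}) {
    extra->events_alive = false;              // events are always destroyed
    if (extra->data_alive && !streams.empty()) {
        extra->data_alive = false;            // device memory is freed only when
    }                                         // the owning streams are supplied
    delete extra;
}

int main() {
    release(new extra_t{true, true});             // events only
    release(new extra_t{true, true}, {0, 1});     // events + device buffers
    std::puts("ok");
}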
ggml/src/ggml-sycl/common.hpp
CHANGED
@@ -19,6 +19,9 @@
 #include "dpct/helper.hpp"
 #include "ggml-sycl.h"
 #include "presets.hpp"
+#include "sycl_hw.hpp"
+
+
 #if GGML_SYCL_DNNL
 #include "dnnl.hpp"
 #include "dnnl_sycl.hpp"
@@ -35,7 +38,10 @@
 void* ggml_sycl_host_malloc(size_t size);
 void ggml_sycl_host_free(void* ptr);
 
+
 extern int g_ggml_sycl_debug;
+extern int g_ggml_sycl_disable_optimize;
+
 #define GGML_SYCL_DEBUG(...) \
     do { \
         if (g_ggml_sycl_debug) \
@@ -182,18 +188,24 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
 }
 
 //////////////////////
+struct optimize_feature {
+    bool reorder=false;
+};
+
+struct sycl_device_info {
+    int cc;                 // compute capability
+    // int nsm;             // number of streaming multiprocessors
+    // size_t smpb;         // max. shared memory per block
+    bool vmm;               // virtual memory support
+    size_t total_vram;
+    sycl_hw_info hw_info;
+    optimize_feature opt_feature;
+};
+
 
 struct ggml_sycl_device_info {
     int device_count;
 
-    struct sycl_device_info {
-        int cc;             // compute capability
-        // int nsm;         // number of streaming multiprocessors
-        // size_t smpb;     // max. shared memory per block
-        bool vmm;           // virtual memory support
-        size_t total_vram;
-    };
-
     sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
 
     std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
@@ -260,17 +272,46 @@ struct ggml_tensor_extra_gpu {
     // tensors
     dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
                           [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
+    optimize_feature optimized_feature;
 };
 
+void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
+
+inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
+    optimize_feature opt;
+
+    opt.reorder =
+        (arch == syclex::architecture::intel_gpu_dg1 ||
+         arch == syclex::architecture::intel_gpu_acm_g10 ||
+         arch == syclex::architecture::intel_gpu_acm_g11 ||
+         arch == syclex::architecture::intel_gpu_acm_g12 ||
+         arch == syclex::architecture::intel_gpu_pvc ||
+         arch == syclex::architecture::intel_gpu_pvc_vg ||
+         arch == syclex::architecture::intel_gpu_mtl_u ||
+         arch == syclex::architecture::intel_gpu_mtl_s ||
+         arch == syclex::architecture::intel_gpu_mtl_h ||
+         arch == syclex::architecture::intel_gpu_arl_u ||
+         arch == syclex::architecture::intel_gpu_arl_s ||
+         arch == syclex::architecture::intel_gpu_arl_h ||
+         arch == syclex::architecture::intel_gpu_bmg_g21 ||
+         arch == syclex::architecture::intel_gpu_lnl_m
+        );
+
+    return opt;
+}
+
 struct ggml_backend_sycl_context {
     int device;
     std::string name;
+    optimize_feature opt_feature;
+    bool optimized_graph=false;
 
     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
 
     explicit ggml_backend_sycl_context(int device) :
         device(device),
         name(GGML_SYCL_NAME + std::to_string(device)) {
+        opt_feature = ggml_sycl_info().devices[device].opt_feature;
     }
 
     queue_ptr stream(int device, int stream) {
@@ -680,5 +721,4 @@ bool gpu_has_xmx(sycl::device &dev);
 void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                           const ggml_tensor *src1, ggml_tensor *dst,
                           const ggml_sycl_op_flatten_t op);
-
 #endif // GGML_SYCL_COMMON_HPP
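Note: check_gpu_optimize_feature enables the reorder path per device architecture, and the commit also adds g_ggml_sycl_disable_optimize (read from GGML_SYCL_DISABLE_OPT) so the optimization can be switched off for debugging. Below is a self-contained sketch of how those two inputs plausibly combine at dispatch time; env_flag and main are illustrative stand-ins, not code from this patch.

// Sketch: per-device feature flag gated by the debug environment variable.
#include <cstdio>
#include <cstdlib>

struct optimize_feature { bool reorder = false; }; // mirrors the struct above

static int env_flag(const char * name, int fallback) {
    const char * v = std::getenv(name);            // same idea as get_sycl_env
    return v ? std::atoi(v) : fallback;
}

int main() {
    optimize_feature feat;
    feat.reorder = true;                           // as if check_gpu_optimize_feature matched
    const int disable = env_flag("GGML_SYCL_DISABLE_OPT", 0);
    const bool use_reorder = feat.reorder && !disable;
    std::printf("reorder path: %s\n", use_reorder ? "on" : "off");
}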
ggml/src/ggml-sycl/convert.cpp
CHANGED
@@ -125,6 +125,25 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
     }
 }
 
+template <typename dst_t>
+static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
+                                             dpct::queue_ptr stream) {
+
+    dpct::has_capability_or_fail(stream->get_device(),
+                                 {sycl::aspect::fp16});
+
+    int constexpr WARP_K = WARP_SIZE * QK4_0;
+    const int n_warp = (k + WARP_K - 1) / WARP_K;
+    GGML_ASSERT(k % 2 == 0);
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
+                                               sycl::range<3>(1, 1, WARP_SIZE),
+                                           sycl::range<3>(1, 1, WARP_SIZE)),
+                         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]]{
+                             dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
+                         });
+
+}
+
 template <typename dst_t>
 static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
                                      dpct::queue_ptr stream) {
@@ -452,10 +471,15 @@ static void convert_unary_sycl(const void *__restrict__ vx,
     }
 }
 
-to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
+            if (dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_0_sycl_reorder;
+            } else {
+                return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
+            }
         case GGML_TYPE_Q4_1:
             return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
         case GGML_TYPE_Q5_0:
@@ -499,10 +523,15 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
     }
 }
 
-to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
     switch (type) {
        case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_sycl;
+            if (dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_0_sycl_reorder;
+            } else {
+                return dequantize_row_q4_0_sycl;
+            }
        case GGML_TYPE_Q4_1:
            return dequantize_row_q4_1_sycl;
        case GGML_TYPE_Q5_0:
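Note: ggml_get_to_fp16_sycl / ggml_get_to_fp32_sycl now take the dst tensor so they can inspect src[0]->extra and hand back the reorder kernel when the flag is set. A minimal model of that function-pointer dispatch follows; all names here are illustrative, not the library's API.

// Sketch: choose a converter from per-tensor state, as the code above does.
#include <cstdio>

using to_fp32_fn = void (*)(const void * src, float * dst, long k);

static void dequant_classic(const void *, float *, long) { std::puts("classic"); }
static void dequant_reorder(const void *, float *, long) { std::puts("reorder"); }

struct tensor_extra { bool reorder; };

static to_fp32_fn pick_converter(const tensor_extra * extra) {
    if (extra && extra->reorder) {
        return dequant_reorder;   // optimized layout recorded on this tensor
    }
    return dequant_classic;
}

int main() {
    tensor_extra e{true};
    pick_converter(&e)(nullptr, nullptr, 0);      // prints "reorder"
    pick_converter(nullptr)(nullptr, nullptr, 0); // prints "classic"
}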
ggml/src/ggml-sycl/convert.hpp
CHANGED
@@ -21,7 +21,7 @@ using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
 typedef to_t_sycl_t<float> to_fp32_sycl_t;
 typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
 
-to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type);
-to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type);
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst);
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst);
 
 #endif // GGML_SYCL_CONVERT_HPP
ggml/src/ggml-sycl/dequantize.hpp
CHANGED
@@ -16,6 +16,8 @@
 #include "common.hpp"
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
+typedef void (*dequantize_kernel_t_reorder)(const void *d, const int64_t ib, const void *qs,
+                                            const int iqs, dfloat2 &v);
 
 static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib,
                                             const int iqs, dfloat2 &v) {
@@ -40,6 +42,29 @@ static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib,
 #endif // GGML_SYCL_F16
 }
 
+static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int64_t ib, const void *qs,
+                                                    const int iqs, dfloat2 &v) {
+    // const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib);
+
+    const int vui = *((const uint8_t *)qs+iqs);
+
+    v.x() = vui & 0xF;
+    v.y() = vui >> 4;
+
+#ifdef GGML_SYCL_F16
+    // v = v - {8.0f, 8.0f};
+    // v = v * {d, d};
+    v.s0() = (v.s0() - 8.0f) * d;
+    v.s1() = (v.s1() - 8.0f) * d;
+
+#else
+    v.x() = (v.x() - 8.0f) * d;
+    v.y() = (v.y() - 8.0f) * d;
+#endif // GGML_SYCL_F16
+}
+
 static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
                                             const int iqs, dfloat2 &v) {
     const block_q4_1 * x = (const block_q4_1 *) vx;
@@ -167,6 +192,36 @@ static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restri
     }
 }
 
+template<typename dst_t>
+static void dequantize_block_q4_0_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
+                                          const sycl::nd_item<3> &item_ct1) {
+
+    const int64_t i = item_ct1.get_group(2);
+    auto k=nb32;
+    // assume 32 threads
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int lane_ib = i * WARP_SIZE + tid;
+
+    if (lane_ib >= k / QK4_0) {
+        return;
+    }
+
+    dst_t * y_ptr = yy + lane_ib * QK4_0;
+
+    auto qs = (const uint8_t*)vx + lane_ib * QK4_0 / 2;
+    auto s_ptr = (const sycl::half*)((const uint8_t*)vx + k / 2) + lane_ib;
+
+    const float d = float(*s_ptr);
+
+#pragma unroll
+    for (int l = 0; l < QK4_0 / 2; ++l) {
+        int vq = qs[l];
+        y_ptr[l + 0] = d * ((vq & 0xF) - 8);
+        y_ptr[l + 16] = d * ((vq >> 4) - 8);
+    }
+
+}
+
 template<typename dst_t>
 static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
                                   const sycl::nd_item<3> &item_ct1) {
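Note: dequantize_q4_0_reorder and dequantize_block_q4_0_reorder unpack each byte into two weights — the low nibble lands at output index l and the high nibble at l + 16, each shifted by -8 and multiplied by the block scale d. Below is a CPU reference for one block mirroring the kernel above; dequant_block_ref is an illustrative name.

// CPU reference for the per-block reorder dequantization.
#include <cstdint>
#include <cstdio>

constexpr int QK4_0 = 32;

static void dequant_block_ref(const uint8_t qs[QK4_0 / 2], float d, float y[QK4_0]) {
    for (int l = 0; l < QK4_0 / 2; ++l) {
        const int vq = qs[l];
        y[l +  0] = d * ((vq & 0xF) - 8); // low nibble -> first half of block
        y[l + 16] = d * ((vq >> 4) - 8);  // high nibble -> second half
    }
}

int main() {
    uint8_t qs[QK4_0 / 2];
    for (int l = 0; l < QK4_0 / 2; ++l) qs[l] = 0x9F; // low nibble 15, high nibble 9
    float y[QK4_0];
    dequant_block_ref(qs, 0.5f, y);
    std::printf("y[0]=%.2f y[16]=%.2f\n", y[0], y[16]); // 3.50 and 0.50
}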
ggml/src/ggml-sycl/dmmv.cpp
CHANGED
@@ -3,7 +3,6 @@
 #include "dequantize.hpp"
 #include "presets.hpp"
 
-
 static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const sycl::half *x = (const sycl::half *)vx;
 
@@ -91,6 +90,112 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
     }
 }
 
+template <int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_reorder>
+static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
+                                           const sycl::nd_item<3> &item_ct1) {
+    // qk = quantized weights per x block
+    // qr = number of quantized weights per data value in x block
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int tid = item_ct1.get_local_id(2);
+
+
+    const int ncols_left = ncols % (QK4_0*WARP_SIZE);
+    const int ncols_align = ncols - ncols_left;
+    const int iter_stride = 8*2*GGML_SYCL_DMMV_X;
+    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter //64/16=4, 512/16/2= 16
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // partial sum for each thread
+#ifdef GGML_SYCL_F16
+    sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+#else
+    float tmp = 0.0f;
+#endif // GGML_SYCL_F16
+    const char *d_ptr = (const char*)vx+ncols*nrows/2;
+    int i=0;
+    for (i = 0; i < ncols_align; i += iter_stride) {
+        const int col = i + vals_per_iter*tid;
+        const int ib = (row*ncols + col)/qk; // x block index
+        const int iqs = (col%qk)/qr; // x quant index
+        const int iybs = col - col%qk; // y block start index
+
+        // processing >2 values per i iter is faster for fast GPUs
+#pragma unroll
+        for (int j = 0; j < vals_per_iter; j += 2) {
+            // process 2 vals per j iter
+
+            // dequantize
+            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
+
+            // matrix multiplication
+            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_SYCL_F16
+            dfloat2 t1{y[iybs + iqs + j / qr + 0],
+                       y[iybs + iqs + j / qr + y_offset]};
+
+            tmp += v * t1;
+#else
+            tmp += v.x() * y[iybs + iqs + j / qr + 0];
+            tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
+#endif // GGML_SYCL_F16
+        }
+    }
+
+    for (; i < ncols; i += iter_stride) {
+        if (tid>=ncols_left/QK4_0) continue;
+        const int col = i + vals_per_iter*tid;
+        const int ib = (row*ncols + col)/qk; // x block index
+        const int iqs = (col%qk)/qr; // x quant index
+        const int iybs = col - col%qk; // y block start index
+
+        // processing >2 values per i iter is faster for fast GPUs
+#pragma unroll
+        for (int j = 0; j < vals_per_iter; j += 2) {
+            // process 2 vals per j iter
+
+            // dequantize
+            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
+
+            // matrix multiplication
+            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_SYCL_F16
+            dfloat2 t1{y[iybs + iqs + j / qr + 0],
+                       y[iybs + iqs + j / qr + y_offset]};
+
+            tmp += v * t1;
+#else
+            tmp += v.x() * y[iybs + iqs + j / qr + 0];
+            tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
+#endif // GGML_SYCL_F16
+        }
+    }
+
+    // sum up partial sums and write back result
+    const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2;
+    for (int mask = mask_start; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+#ifdef GGML_SYCL_F16
+        dst[row] = tmp.x() + tmp.y();
+#else
+        dst[row] = tmp;
+#endif // GGML_SYCL_F16
+    }
+}
+
 static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
                                          float *dst, const int ncols,
                                          const int nrows,
@@ -759,6 +864,28 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
     }
 }
 
+static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloat *y,
+                                                     float *dst, const int ncols,
+                                                     const int nrows,
+                                                     dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
 
 static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
                                              float *dst, const int ncols,
@@ -953,7 +1080,6 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
 
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
-
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_SYCL_F16
@@ -967,7 +1093,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
 
     if (src1_convert_f16) {
         src1_dfloat = src1_dfloat_a.alloc(ne00);
-        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
+        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
        GGML_ASSERT(to_fp16_sycl != nullptr);
        to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream);
     }
@@ -977,7 +1103,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
+            if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            } else {
+                dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            }
             break;
         case GGML_TYPE_Q4_1:
             dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
@@ -1020,4 +1151,5 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
     GGML_UNUSED(src1_ddq_i);
     GGML_UNUSED(src1_ncols);
     GGML_UNUSED(src1_padded_row_size);
+    GGML_UNUSED(ctx);
 }
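Note: dequantize_mul_mat_vec_reorder finishes with a sub-group butterfly reduction — each lane repeatedly adds the partial sum held by its XOR-partner lane via dpct::permute_sub_group_by_xor, after which every lane holds the row's full dot product and lane 0 writes it out. A plain-C++ model of the same pattern follows, with an array standing in for the 32 lanes; WARP and butterfly_sum are illustrative.

// Model of the XOR-butterfly sub-group reduction used by the kernel above.
#include <cstdio>

constexpr int WARP = 32;

static void butterfly_sum(float lane[WARP]) {
    for (int mask = WARP >> 1; mask > 0; mask >>= 1) {
        float shuffled[WARP];
        for (int t = 0; t < WARP; ++t) {
            shuffled[t] = lane[t ^ mask];   // value from the XOR-partner lane
        }
        for (int t = 0; t < WARP; ++t) {
            lane[t] += shuffled[t];         // every lane ends with the full sum
        }
    }
}

int main() {
    float lane[WARP];
    for (int t = 0; t < WARP; ++t) lane[t] = 1.0f;
    butterfly_sum(lane);
    std::printf("lane 0 sum: %.1f\n", lane[0]); // 32.0
}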
ggml/src/ggml-sycl/getrows.cpp
ADDED
@@ -0,0 +1,308 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "ggml-impl.h"
+#include "common.hpp"
+#include "dequantize.hpp"
+#include "getrows.hpp"
+
+
+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void k_get_rows(
+            const void * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12,
+            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
+
+    const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                     item_ct1.get_local_id(2)) *
+                    2;
+    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) /
+                    ne12;
+    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) %
+                    ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
+
+    const int ib = i00/qk; // block index
+    const int iqs = (i00%qk)/qr; // quant index
+    const int iybs = i00 - i00%qk; // dst block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(src0_row, ib, iqs, v);
+
+    dst_row[iybs + iqs + 0] = v.x();
+    dst_row[iybs + iqs + y_offset] = v.y();
+}
+
+template<int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_recorder, typename dst_t>
+static void k_get_rows_reorder(
+            const void * src0, const void *src0_dq, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12,
+            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
+
+    const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                     item_ct1.get_local_id(2)) *
+                    2;
+    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) /
+                    ne12;
+    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) %
+                    ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+    auto ncols = ne00;
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+
+    const int src0_off = i01 * ncols + i00;
+    const int ib = src0_off / QK4_0; // block index
+    const int iqs = (i00%qk)/qr; // x quant index
+    const int iybs = i00 - i00%qk; // dst block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel_recorder((const void *)src0_dq, ib, (const void *)src0, src0_off/2, v);
+
+    dst_row[iybs + iqs + 0] = v.x();
+    dst_row[iybs + iqs + y_offset] = v.y();
+
+    GGML_UNUSED(nb01);
+    GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03);
+}
+
+template<typename src0_t, typename dst_t>
+static void k_get_rows_float(
+            const src0_t * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12,
+            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
+
+    const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                    item_ct1.get_local_id(2);
+    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) /
+                    ne12;
+    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) %
+                    ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
+
+    dst_row[i00] = src0_row[i00];
+}
+
+template <int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                          ggml_tensor *dst, const void *src0_dd,
+                          const int32_t *src1_dd, float *dst_dd,
+                          queue_ptr stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
+    const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE);
+    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    GGML_ASSERT(ne00 % 2 == 0);
+
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             k_get_rows<qk, qr, dq>(
+                                 src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
+                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
+                         });
+
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
+}
+
+template <int qk, int qr, dequantize_kernel_t_reorder dq_reorder>
+static void get_rows_sycl_reorder(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                                  ggml_tensor *dst, const void *src0_dd,
+                                  const int32_t *src1_dd, float *dst_dd,
+                                  queue_ptr stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
+    const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE);
+    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    GGML_ASSERT(ne00 % 2 == 0);
+
+    const uint8_t* src0_q = (const uint8_t*)src0_dd;
+    const size_t ncols = ne00;
+    const size_t nrows = ne01;
+    const sycl::half* src0_dq = (const sycl::half*)(src0_q + nrows * ncols / 2);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]]{
+                             k_get_rows_reorder<qk, qr, dq_reorder>(
+                                 src0_dd, src0_dq, src1_dd, dst_dd, ne00, ne12, s1, s2,
+                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
+                         });
+
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
+}
+
+
+template <typename src0_t>
+static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const src0_t *src0_dd, const int32_t *src1_dd,
+                                float *dst_dd, queue_ptr stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
+    const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE;
+    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
+                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
+            });
+    }
+
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
+}
+
+void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                           const ggml_tensor *src1, ggml_tensor *dst,
+                           const float *src0_d, const float *src1_d,
+                           float *dst_d, const queue_ptr &stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_sycl_float(ctx, src0, src1, dst, (const sycl::half *)src0_d,
+                                src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_sycl_float(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            if (ctx.opt_feature.reorder && dst->op == GGML_OP_MUL_MAT) {
+                get_rows_sycl_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            } else {
+                get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            }
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_sycl<QK4_1, QR4_1, dequantize_q4_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_sycl<QK5_0, QR5_0, dequantize_q5_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_sycl<QK5_1, QR5_1, dequantize_q5_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_sycl<QK8_0, QR8_0, dequantize_q8_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        default:
+            // TODO: k-quants
+            GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+            GGML_ABORT("fatal error");
+            break;
+    }
+}
+
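Note: k_get_rows_reorder addresses the reordered matrix through a flat weight index — src0_off = i01*ncols + i00 selects both the scale block (src0_off / QK4_0) and, at two weights per byte, the packed-byte position (src0_off / 2). A small standalone check of that arithmetic follows; the concrete values are illustrative.

// Index arithmetic mirroring k_get_rows_reorder above.
#include <cstdio>

constexpr int QK4_0 = 32;

int main() {
    const int ncols = 4096, i01 = 7, i00 = 100;
    const int src0_off = i01 * ncols + i00; // flat weight index into the row data
    const int ib = src0_off / QK4_0;        // block index -> picks the fp16 scale
    const int qs_byte = src0_off / 2;       // two 4-bit weights per packed byte
    std::printf("block %d, qs byte %d\n", ib, qs_byte);
}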
ggml/src/ggml-sycl/getrows.hpp
ADDED
@@ -0,0 +1,23 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_GETROWS_HPP
+#define GGML_SYCL_GETROWS_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                           const ggml_tensor *src1, ggml_tensor *dst,
+                           const float *src0_d, const float *src1_d,
+                           float *dst_d, const queue_ptr &stream);
+
+#endif // GGML_SYCL_GETROWS_HPP
ggml/src/ggml-sycl/ggml-sycl.cpp
CHANGED
|
@@ -39,9 +39,12 @@
|
|
| 39 |
#include "ggml-sycl/backend.hpp"
|
| 40 |
#include "ggml-sycl/presets.hpp"
|
| 41 |
#include "ggml-sycl/gemm.hpp"
|
|
|
|
|
|
|
| 42 |
|
| 43 |
static bool g_sycl_loaded = false;
|
| 44 |
int g_ggml_sycl_debug = 0;
|
|
|
|
| 45 |
|
| 46 |
static ggml_sycl_device_info ggml_sycl_init() {
|
| 47 |
ggml_sycl_device_info info = {};
|
|
@@ -64,14 +67,18 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
|
| 64 |
for (int i = 0; i < info.device_count; ++i) {
|
| 65 |
info.devices[i].vmm = 0;
|
| 66 |
dpct::device_info prop;
|
|
|
|
|
|
|
| 67 |
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
| 68 |
-
prop,
|
| 69 |
|
| 70 |
info.default_tensor_split[i] = total_vram;
|
| 71 |
total_vram += prop.get_global_mem_size();
|
| 72 |
|
| 73 |
info.devices[i].cc =
|
| 74 |
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
|
|
|
|
|
|
| 75 |
|
| 76 |
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
|
| 77 |
}
|
|
@@ -110,6 +117,27 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
|
|
| 110 |
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
|
| 111 |
}
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
void ggml_backend_sycl_print_sycl_devices() {
|
| 114 |
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
|
| 115 |
int device_count = dpct::dev_mgr::instance().device_count();
|
|
@@ -138,6 +166,8 @@ void ggml_backend_sycl_print_sycl_devices() {
|
|
| 138 |
<< "]";
|
| 139 |
print_device_detail(id, device, device_type.str());
|
| 140 |
}
|
|
|
|
|
|
|
| 141 |
}
|
| 142 |
|
| 143 |
static inline int get_sycl_env(const char *env_name, int default_val) {
|
|
@@ -159,17 +189,21 @@ static void ggml_check_sycl() try {
|
|
| 159 |
|
| 160 |
if (!initialized) {
|
| 161 |
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
|
|
|
| 162 |
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
|
| 163 |
-
GGML_LOG_INFO("
|
|
|
|
|
|
|
|
|
|
| 164 |
#if defined(GGML_SYCL_FORCE_MMQ)
|
| 165 |
-
GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ:
|
| 166 |
#else
|
| 167 |
-
GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ:
|
| 168 |
#endif
|
| 169 |
#if defined(GGML_SYCL_F16)
|
| 170 |
-
GGML_LOG_INFO("GGML_SYCL_F16: yes\n");
|
| 171 |
#else
|
| 172 |
-
GGML_LOG_INFO("GGML_SYCL_F16: no\n");
|
| 173 |
#endif
|
| 174 |
|
| 175 |
/* NOT REMOVE, keep it for next optimize for XMX.
|
|
@@ -241,19 +275,27 @@ struct ggml_backend_sycl_buffer_context {
|
|
| 241 |
void * dev_ptr = nullptr;
|
| 242 |
queue_ptr stream;
|
| 243 |
std::string name;
|
|
|
|
|
|
|
| 244 |
|
| 245 |
-
|
| 246 |
device(device), dev_ptr(dev_ptr), stream(stream) {
|
| 247 |
check_allow_gpu_index(device);
|
| 248 |
name = (GGML_SYCL_NAME + std::to_string(device));
|
|
|
|
| 249 |
}
|
| 250 |
|
| 251 |
-
|
| 252 |
~ggml_backend_sycl_buffer_context() {
|
| 253 |
if (dev_ptr != nullptr) {
|
| 254 |
ggml_sycl_set_device(device);
|
| 255 |
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
|
| 256 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
}
|
| 258 |
};
|
| 259 |
|
|
@@ -291,6 +333,9 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
| 291 |
return;
|
| 292 |
}
|
| 293 |
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
if (ggml_is_quantized(tensor->type)) {
|
| 296 |
// initialize padding to 0 to avoid possible NaN values
|
|
@@ -316,7 +361,6 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
| 316 |
size_t size) try {
|
| 317 |
|
| 318 |
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
| 319 |
-
|
| 320 |
ggml_sycl_set_device(ctx->device);
|
| 321 |
auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
|
| 322 |
SYCL_CHECK(
|
|
@@ -660,32 +704,7 @@ struct ggml_backend_sycl_split_buffer_type_context {
|
|
| 660 |
struct ggml_backend_sycl_split_buffer_context {
|
| 661 |
~ggml_backend_sycl_split_buffer_context() try {
|
| 662 |
for (ggml_tensor_extra_gpu * extra : tensor_extras) {
|
| 663 |
-
|
| 664 |
-
for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
|
| 665 |
-
if (extra->events[i][is] != nullptr) {
|
| 666 |
-
/*
|
| 667 |
-
DPCT1009:206: SYCL uses exceptions to report errors and
|
| 668 |
-
does not use the error codes. The original code was
|
| 669 |
-
commented out and a warning string was inserted. You
|
| 670 |
-
need to rewrite this code.
|
| 671 |
-
*/
|
| 672 |
-
SYCL_CHECK(CHECK_TRY_ERROR(
|
| 673 |
-
dpct::destroy_event(extra->events[i][is])));
|
| 674 |
-
}
|
| 675 |
-
}
|
| 676 |
-
if (extra->data_device[i] != nullptr) {
|
| 677 |
-
/*
|
| 678 |
-
DPCT1009:207: SYCL uses exceptions to report errors and does
|
| 679 |
-
not use the error codes. The original code was commented out
|
| 680 |
-
and a warning string was inserted. You need to rewrite this
|
| 681 |
-
code.
|
| 682 |
-
*/
|
| 683 |
-
ggml_sycl_set_device(i);
|
| 684 |
-
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(
|
| 685 |
-
extra->data_device[i], *(streams[i]))));
|
| 686 |
-
}
|
| 687 |
-
}
|
| 688 |
-
delete extra;
|
| 689 |
}
|
| 690 |
}
|
| 691 |
catch (sycl::exception const &exc) {
|
|
@@ -723,7 +742,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
| 723 |
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
| 724 |
|
| 725 |
ctx->tensor_extras.push_back(extra);
|
| 726 |
-
|
| 727 |
|
| 728 |
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
| 729 |
int64_t row_low, row_high;
|
|
@@ -1337,83 +1356,6 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
|
|
| 1337 |
reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
|
| 1338 |
}
|
| 1339 |
|
| 1340 |
-
template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
| 1341 |
-
static void k_get_rows(
|
| 1342 |
-
const void * src0, const int32_t * src1, dst_t * dst,
|
| 1343 |
-
int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
|
| 1344 |
-
/*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
|
| 1345 |
-
/*size_t s0,*/ size_t s1, size_t s2, size_t s3,
|
| 1346 |
-
/*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
|
| 1347 |
-
size_t s10, size_t s11, size_t s12,
|
| 1348 |
-
const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
|
| 1349 |
-
|
| 1350 |
-
const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
|
| 1351 |
-
item_ct1.get_local_id(2)) *
|
| 1352 |
-
2;
|
| 1353 |
-
const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
| 1354 |
-
item_ct1.get_local_id(1);
|
| 1355 |
-
const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
|
| 1356 |
-
item_ct1.get_local_id(0)) /
|
| 1357 |
-
ne12;
|
| 1358 |
-
const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
|
| 1359 |
-
item_ct1.get_local_id(0)) %
|
| 1360 |
-
ne12;
|
| 1361 |
-
|
| 1362 |
-
if (i00 >= ne00) {
|
| 1363 |
-
return;
|
| 1364 |
-
}
|
| 1365 |
-
|
| 1366 |
-
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
|
| 1367 |
-
|
| 1368 |
-
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
|
| 1369 |
-
const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
|
| 1370 |
-
|
| 1371 |
-
const int ib = i00/qk; // block index
|
| 1372 |
-
const int iqs = (i00%qk)/qr; // quant index
|
| 1373 |
-
const int iybs = i00 - i00%qk; // dst block start index
|
| 1374 |
-
const int y_offset = qr == 1 ? 1 : qk/2;
|
| 1375 |
-
|
| 1376 |
-
// dequantize
|
| 1377 |
-
dfloat2 v;
|
| 1378 |
-
dequantize_kernel(src0_row, ib, iqs, v);
|
| 1379 |
-
|
| 1380 |
-
dst_row[iybs + iqs + 0] = v.x();
|
| 1381 |
-
dst_row[iybs + iqs + y_offset] = v.y();
|
| 1382 |
-
}
|
| 1383 |
-
|
| 1384 |
-
template<typename src0_t, typename dst_t>
|
| 1385 |
-
static void k_get_rows_float(
|
| 1386 |
-
const src0_t * src0, const int32_t * src1, dst_t * dst,
|
| 1387 |
-
int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
|
| 1388 |
-
/*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
|
| 1389 |
-
/*size_t s0,*/ size_t s1, size_t s2, size_t s3,
|
| 1390 |
-
/*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
|
| 1391 |
-
size_t s10, size_t s11, size_t s12,
|
| 1392 |
-
const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
|
| 1393 |
-
|
| 1394 |
-
const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
|
| 1395 |
-
item_ct1.get_local_id(2);
|
| 1396 |
-
const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
| 1397 |
-
item_ct1.get_local_id(1);
|
| 1398 |
-
const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
|
| 1399 |
-
item_ct1.get_local_id(0)) /
|
| 1400 |
-
ne12;
|
| 1401 |
-
const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
|
| 1402 |
-
item_ct1.get_local_id(0)) %
|
| 1403 |
-
ne12;
|
| 1404 |
-
|
| 1405 |
-
if (i00 >= ne00) {
|
| 1406 |
-
return;
|
| 1407 |
-
}
|
| 1408 |
-
|
| 1409 |
-
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
|
| 1410 |
-
|
| 1411 |
-
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
|
| 1412 |
-
const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
|
| 1413 |
-
|
| 1414 |
-
dst_row[i00] = src0_row[i00];
|
| 1415 |
-
}
|
| 1416 |
-
|
| 1417 |
static void mul_mat_p021_f16_f32(
|
| 1418 |
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
|
| 1419 |
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
|
|
@@ -1896,81 +1838,6 @@ static void pool2d_nchw_kernel(
|
|
| 1896 |
o_ptr[cur_oh * ow + cur_ow] = res;
|
| 1897 |
}
|
| 1898 |
|
-template <int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                          ggml_tensor *dst, const void *src0_dd,
-                          const int32_t *src1_dd, float *dst_dd,
-                          queue_ptr stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
-    const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE);
-    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    GGML_ASSERT(ne00 % 2 == 0);
-
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             k_get_rows<qk, qr, dq>(
-                                 src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
-                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
-                         });
-
-    GGML_UNUSED(dst);
-    GGML_UNUSED(ctx);
-}
-
-template <typename src0_t>
-static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const src0_t *src0_dd, const int32_t *src1_dd,
-                                float *dst_dd, queue_ptr stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
-    const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE;
-    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
-                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
-            });
-    }
-
-    GGML_UNUSED(dst);
-    GGML_UNUSED(ctx);
-}
-
 static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                    const int ky, const int kx_padded,
                                    queue_ptr stream) {
@@ -2494,52 +2361,6 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_d, const float *src1_d,
-                                  float *dst_d, const queue_ptr &stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
-    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
-
-    const int32_t * src1_i32 = (const int32_t *) src1_d;
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            get_rows_sycl_float(ctx, src0, src1, dst, (const sycl::half *)src0_d,
-                                src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_F32:
-            get_rows_sycl_float(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_0:
-            get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            get_rows_sycl<QK4_1, QR4_1, dequantize_q4_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            get_rows_sycl<QK5_0, QR5_0, dequantize_q5_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            get_rows_sycl<QK5_1, QR5_1, dequantize_q5_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            get_rows_sycl<QK8_0, QR8_0, dequantize_q8_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        default:
-            // TODO: k-quants
-            GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
-            GGML_ABORT("fatal error");
-            break;
-    }
-}
-
-
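The get_rows kernels and this dispatcher were moved into the new ggml-sycl/getrows.cpp (+308 lines), with declarations in ggml-sycl/getrows.hpp. The header itself is not reproduced on this page; a plausible sketch, assuming the moved dispatcher keeps the signature shown above:

// Hypothetical sketch of ggml-sycl/getrows.hpp; the real header is part of this
// commit but not shown here. Assumes the dispatcher's signature is unchanged.
#ifndef GGML_SYCL_GETROWS_HPP
#define GGML_SYCL_GETROWS_HPP

#include "common.hpp"

void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
                           const ggml_tensor * src1, ggml_tensor * dst,
                           const float * src0_d, const float * src1_d,
                           float * dst_d, const queue_ptr & stream);

#endif // GGML_SYCL_GETROWS_HPP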
 static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_d, const float *src1_d,
@@ -2589,11 +2410,10 @@ inline void ggml_sycl_op_mul_mat_sycl(
     if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
         dst->op_params[0] == GGML_PREC_DEFAULT) {
-
         // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
         ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
         if (src0->type != GGML_TYPE_F16) {
-            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type);
             GGML_ASSERT(to_fp16_sycl != nullptr);
             size_t ne = row_diff*ne00;
             src0_as_f16.alloc(ne);
@@ -2605,7 +2425,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
 
         ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
         if (src1->type != GGML_TYPE_F16) {
-            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
             GGML_ASSERT(to_fp16_sycl != nullptr);
             size_t ne = src1_ncols*ne10;
             src1_as_f16.alloc(ne);
@@ -2626,13 +2446,13 @@ inline void ggml_sycl_op_mul_mat_sycl(
             src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
             dst_f16.get(), dpct::library_data_t::real_half, ldc,
             dpct::library_data_t::real_half)));
-        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
         to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
 #else
         auto dnnl_stream = ctx.stream_dnnl(stream);
         DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
             src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
-        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
         to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
 #endif
     }
@@ -2641,13 +2461,13 @@ inline void ggml_sycl_op_mul_mat_sycl(
         ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
         ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
         if (src0->type != GGML_TYPE_F32) {
-            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type);
             GGML_ASSERT(to_fp32_sycl != nullptr);
             src0_ddq_as_f32.alloc(row_diff*ne00);
             to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
         }
         if (src1->type != GGML_TYPE_F32) {
-            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type);
             GGML_ASSERT(to_fp32_sycl != nullptr);
             src1_ddq_as_f32.alloc(src1_ncols*ne10);
             to_fp32_sycl(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
@@ -3085,7 +2905,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
     for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
         const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
-
        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
            if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
                continue;
@@ -3393,7 +3212,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
     // convert src1 to fp16
     ggml_sycl_pool_alloc<sycl::half> src1_f16_alloc(ctx.pool());
     if (src1->type != GGML_TYPE_F16) {
-        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
         const int64_t ne_src1 = ggml_nelements(src1);
         src1_f16_alloc.alloc(ne_src1);
         GGML_ASSERT(to_fp16_sycl != nullptr);
@@ -3509,6 +3328,7 @@ bool ggml_sycl_supports_dmmv(enum ggml_type type) {
 }
 
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
     int64_t min_compute_capability = INT_MAX;
 
@@ -3570,6 +3390,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
     } else if (use_mul_mat_vec_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
     } else if (use_mul_mat_q) {
@@ -4251,10 +4072,72 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
 static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
     ggml_sycl_set_main_device(sycl_ctx->device);
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
 #include "ggml-sycl/backend.hpp"
 #include "ggml-sycl/presets.hpp"
 #include "ggml-sycl/gemm.hpp"
+#include "ggml-sycl/sycl_hw.hpp"
+#include "ggml-sycl/getrows.hpp"
 
 static bool g_sycl_loaded = false;
 int g_ggml_sycl_debug = 0;
+int g_ggml_sycl_disable_optimize = 0;
 
 static ggml_sycl_device_info ggml_sycl_init() {
     ggml_sycl_device_info info = {};
 
     for (int i = 0; i < info.device_count; ++i) {
         info.devices[i].vmm = 0;
         dpct::device_info prop;
+        sycl::device device = dpct::dev_mgr::instance().get_device(i);
+
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+            prop, device)));
 
         info.default_tensor_split[i] = total_vram;
         total_vram += prop.get_global_mem_size();
 
         info.devices[i].cc =
             100 * prop.get_major_version() + 10 * prop.get_minor_version();
+        info.devices[i].hw_info = get_device_hw_info(&device);
+        info.devices[i].opt_feature = check_gpu_optimize_feature(info.devices[i].hw_info.arch);
 
         info.max_work_group_sizes[i] = prop.get_max_work_group_size();
     }
 
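check_gpu_optimize_feature() maps the detected syclex::architecture to the set of optimizations that are safe to enable; per the commit message it replaces a custom hardware define with syclex::architecture. A plausible sketch, assuming the reorder path is gated to Intel GPU architectures (the real implementation ships with the common.hpp change, not reproduced on this page, and the architecture names below are examples only):

// Hypothetical sketch; architecture enumerators are illustrative.
static optimize_feature check_gpu_optimize_feature(syclex::architecture & arch) {
    optimize_feature opt;
    opt.reorder = (arch == syclex::architecture::intel_gpu_pvc ||
                   arch == syclex::architecture::intel_gpu_acm_g10);  // Intel GPUs only (assumption)
    return opt;
}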
         global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
 }
 
+void print_device_opt_feature(int device_count) {
+    GGML_LOG_INFO("SYCL Optimization Feature:\n");
+    GGML_LOG_INFO(
+        "|ID|        Device Type|Reorder|\n");
+    GGML_LOG_INFO(
+        "|--|-------------------|-------|\n");
+    std::map<std::string, size_t> DeviceNums;
+    for (int id = 0; id < device_count; ++id) {
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        std::string backend_type = get_device_backend_and_type(device);
+        int type_id = DeviceNums[backend_type]++;
+        std::stringstream device_type;
+        device_type << "[" << backend_type << ":" << std::to_string(type_id)
+                    << "]";
+        std::string device_type_s = device_type.str();
+        device_type_s = std::regex_replace(device_type_s, std::regex("ext_oneapi_"), "");
+        GGML_LOG_INFO("|%2d|%19s|%7s|\n", id, device_type_s.c_str(),
+                      ggml_sycl_info().devices[id].opt_feature.reorder ? "Y" : "N");
+    }
+
+}
 void ggml_backend_sycl_print_sycl_devices() {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
     int device_count = dpct::dev_mgr::instance().device_count();
 
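Given the format strings above, a single-GPU Intel system would log a table along these lines (illustrative values only; the device-type string depends on what get_device_backend_and_type() returns after the "ext_oneapi_" prefix is stripped):

SYCL Optimization Feature:
|ID|        Device Type|Reorder|
|--|-------------------|-------|
| 0| [level_zero:gpu:0]|      Y|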
             << "]";
         print_device_detail(id, device, device_type.str());
     }
+
+    print_device_opt_feature(device_count);
 }
 
 static inline int get_sycl_env(const char *env_name, int default_val) {
 
 
     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
+        g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
+        GGML_LOG_INFO("Running with Environment Variables:\n");
+        GGML_LOG_INFO("  GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
+        GGML_LOG_INFO("  GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
+        GGML_LOG_INFO("Build with Macros:\n");
 #if defined(GGML_SYCL_FORCE_MMQ)
+        GGML_LOG_INFO("  GGML_SYCL_FORCE_MMQ: yes\n");
 #else
+        GGML_LOG_INFO("  GGML_SYCL_FORCE_MMQ: no\n");
 #endif
 #if defined(GGML_SYCL_F16)
+        GGML_LOG_INFO("  GGML_SYCL_F16: yes\n");
 #else
+        GGML_LOG_INFO("  GGML_SYCL_F16: no\n");
 #endif
 
     /* NOT REMOVE, keep it for next optimize for XMX.
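The body of get_sycl_env() is unchanged by this commit and not shown in the hunk; a minimal sketch of its presumed behavior (parse an integer environment variable, keeping the default on failure):

#include <cstdio>
#include <cstdlib>

// Presumed behavior (sketch; the actual body is outside this diff):
static inline int get_sycl_env(const char * env_name, int default_val) {
    int value = default_val;
    const char * str = std::getenv(env_name);
    if (str != nullptr) {
        std::sscanf(str, "%d", &value);  // value stays at default_val if parsing fails
    }
    return value;
}

With this in place, launching with GGML_SYCL_DISABLE_OPT=1 in the environment sets g_ggml_sycl_disable_optimize, which skips optimize_graph_once() in ggml_backend_sycl_graph_compute() further down.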
     void * dev_ptr = nullptr;
     queue_ptr stream;
     std::string name;
+    optimize_feature opt_feature;
+    std::vector<ggml_tensor_extra_gpu *> tensor_extras;
 
     ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
         device(device), dev_ptr(dev_ptr), stream(stream) {
         check_allow_gpu_index(device);
         name = (GGML_SYCL_NAME + std::to_string(device));
+        opt_feature = ggml_sycl_info().devices[device].opt_feature;
     }
 
     ~ggml_backend_sycl_buffer_context() {
         if (dev_ptr != nullptr) {
             ggml_sycl_set_device(device);
             SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
         }
+
+        // release the extras used by the tensors
+        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
+            release_extra_gpu(extra);
+        }
     }
 };
 
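The destructor above calls release_extra_gpu() with a single argument, while the split-buffer context further down also passes its stream list; the declaration in common.hpp (part of this commit, not shown here) presumably defaults the second parameter, so the one-argument form only destroys the per-device events and leaves device buffers alone:

// Presumed declaration in ggml-sycl/common.hpp (assumption: streams defaults
// to an empty vector, matching the one-argument call above).
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams = {});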
         return;
     }
 
+    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
+    tensor->extra = extra;
+    ctx->tensor_extras.push_back(extra); // used to release it when the ctx is destroyed.
 
     if (ggml_is_quantized(tensor->type)) {
         // initialize padding to 0 to avoid possible NaN values
 
                                        size_t size) try {
 
     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
     ggml_sycl_set_device(ctx->device);
     auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
     SYCL_CHECK(
 
 struct ggml_backend_sycl_split_buffer_context {
     ~ggml_backend_sycl_split_buffer_context() try {
         for (ggml_tensor_extra_gpu * extra : tensor_extras) {
+            release_extra_gpu(extra, streams);
         }
     }
     catch (sycl::exception const &exc) {
 
         ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
 
         ctx->tensor_extras.push_back(extra);
+        ctx->streams.push_back(&(dpct::get_current_device().default_queue()));
 
         for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
             int64_t row_low, row_high;
 
         reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
 }
 
 static void mul_mat_p021_f16_f32(
     const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
 
         o_ptr[cur_oh * ow + cur_ow] = res;
 }
 
 static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                    const int ky, const int kx_padded,
                                    queue_ptr stream) {
 
   std::exit(1);
 }
 
 static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_d, const float *src1_d,
 
     if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
         dst->op_params[0] == GGML_PREC_DEFAULT) {
         // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
         ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
         if (src0->type != GGML_TYPE_F16) {
+            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type, dst);
             GGML_ASSERT(to_fp16_sycl != nullptr);
             size_t ne = row_diff*ne00;
             src0_as_f16.alloc(ne);
 
         ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
         if (src1->type != GGML_TYPE_F16) {
+            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
             GGML_ASSERT(to_fp16_sycl != nullptr);
             size_t ne = src1_ncols*ne10;
             src1_as_f16.alloc(ne);
 
             src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
             dst_f16.get(), dpct::library_data_t::real_half, ldc,
             dpct::library_data_t::real_half)));
+        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
         to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
 #else
         auto dnnl_stream = ctx.stream_dnnl(stream);
         DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
             src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
+        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
         to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
 #endif
     }
 
         ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
         ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
         if (src0->type != GGML_TYPE_F32) {
+            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
             GGML_ASSERT(to_fp32_sycl != nullptr);
             src0_ddq_as_f32.alloc(row_diff*ne00);
             to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
         }
         if (src1->type != GGML_TYPE_F32) {
+            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type, dst);
             GGML_ASSERT(to_fp32_sycl != nullptr);
             src1_ddq_as_f32.alloc(src1_ncols*ne10);
             to_fp32_sycl(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
 
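All conversion getters now receive the destination tensor, which lets them inspect src0's extra and pick a kernel that understands the reordered Q4_0 layout (the matching kernels live in the convert.cpp/convert.hpp and dequantize.hpp changes). A minimal sketch of the idea, with hypothetical kernel names; the real dispatch is in the convert.cpp change:

// Sketch only (assumption): how the dst parameter can steer kernel selection.
typedef void (*to_fp16_sycl_t)(const void * x, sycl::half * y, int64_t k, dpct::queue_ptr stream);

void dequantize_row_q4_0_sycl(const void *, sycl::half *, int64_t, dpct::queue_ptr);          // hypothetical
void dequantize_row_q4_0_sycl_reorder(const void *, sycl::half *, int64_t, dpct::queue_ptr);  // hypothetical

static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
    const ggml_tensor_extra_gpu * extra =
        (dst && dst->src[0]) ? (const ggml_tensor_extra_gpu *) dst->src[0]->extra : nullptr;
    const bool reorder = extra && extra->optimized_feature.reorder;

    switch (type) {
        case GGML_TYPE_Q4_0:
            return reorder ? dequantize_row_q4_0_sycl_reorder
                           : dequantize_row_q4_0_sycl;
        // ... other cases as in the previous single-argument getter ...
        default:
            return nullptr;
    }
}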
     for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
         const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
            if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
                continue;
 
     // convert src1 to fp16
     ggml_sycl_pool_alloc<sycl::half> src1_f16_alloc(ctx.pool());
     if (src1->type != GGML_TYPE_F16) {
+        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
         const int64_t ne_src1 = ggml_nelements(src1);
         src1_f16_alloc.alloc(ne_src1);
         GGML_ASSERT(to_fp16_sycl != nullptr);
 
 }
 
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
     int64_t min_compute_capability = INT_MAX;
 
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
+        // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
     } else if (use_mul_mat_vec_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
     } else if (use_mul_mat_q) {
 
   std::exit(1);
 }
 
+void reorder_qw(char *data_device, const int ncols, const int nrows,
+                size_t size, size_t offset, dpct::queue_ptr stream) {
+    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
+    SYCL_CHECK(
+        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
+                            .wait()));
+    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
+    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
+    int offset_blks = offset / sizeof(block_q4_0);
+    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
+    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
+
+    stream->parallel_for(
+        size / sizeof(block_q4_0),
+        [=](auto i) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            const block_q4_0* x = (const block_q4_0*)tmp_buf;
+            const int ib = i;
+
+            for (int j = 0; j < QK4_0/2; j++) {
+                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
+            }
+            *(d_ptr + ib) = x[ib].d;
+        });
+
+    sycl::free(tmp_buf, *stream);
+}
+
+void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
+    char * data_device = (char *)src0->data;
+    size_t ncols = src0->ne[0];
+    size_t nrows = src0->ne[1];
+    size_t size = ggml_nbytes(src0);
+
+    reorder_qw(data_device, ncols, nrows, size, 0, stream);
+}
+
+void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
+    ggml_tensor * src0 = dst->src[0];
+    ggml_tensor * src1 = dst->src[1];
+
+    if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 &&
+        src1->ne[2]==1 && src1->ne[3]==1) {
+        reorder_qw(src0, stream);
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)src0->extra;
+        GGML_ASSERT(extra);
+        extra->optimized_feature.reorder = true; // used to decode/dequantize in the next steps.
+    }
+}
+
+void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
+    dpct::queue_ptr stream = ctx->stream();
+    if (ctx->optimized_graph) {
+        return;
+    }
+    ctx->optimized_graph = true;
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
+    }
+}
+
 static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
     ggml_sycl_set_main_device(sycl_ctx->device);
 
+    if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
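Net effect of reorder_qw(): the array-of-structs Q4_0 layout (each block_q4_0 interleaves a half-precision scale d with QK4_0/2 quant bytes) becomes struct-of-arrays, with all quant bytes first and all scales after, so consecutive work-items read contiguous memory. A small sketch of how the read side addresses the reordered buffer (names are illustrative; the actual consumers live in the dequantize.hpp and dmmv.cpp changes):

#include <cstddef>
#include <cstdint>
#include <sycl/sycl.hpp>

// Illustrative view over the buffer produced by reorder_qw();
// nblocks = ncols * nrows / QK4_0.
struct reordered_q4_0_view {
    const uint8_t    * qs;  // nblocks * QK4_0/2 packed nibble bytes
    const sycl::half * d;   // nblocks scales, stored after the qs region
};

inline reordered_q4_0_view make_q4_0_view(const void * data, size_t ncols, size_t nrows) {
    reordered_q4_0_view v;
    v.qs = (const uint8_t *) data;
    v.d  = (const sycl::half *) (v.qs + ncols * nrows / 2);  // same split as reorder_qw()
    return v;
}

// quant byte j of block ib: v.qs[ib * QK4_0 / 2 + j]
// scale of block ib:        v.d[ib]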
ggml/src/ggml-sycl/sycl_hw.cpp
ADDED
@@ -0,0 +1,13 @@
+#include "sycl_hw.hpp"
+
+
+sycl_hw_info get_device_hw_info(sycl::device *device_ptr) {
+  sycl_hw_info res;
+  int32_t id = device_ptr->get_info<sycl::ext::intel::info::device::device_id>();
+  res.device_id = id;
+
+  syclex::architecture arch = device_ptr->get_info<syclex::info::device::architecture>();
+  res.arch = arch;
+
+  return res;
+}
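A minimal standalone use of the new helper, mirroring what ggml_sycl_init() does per device (sketch; it assumes the default GPU is an Intel device, since sycl::ext::intel::info::device::device_id is an Intel-specific query):

#include <cstdio>
#include <sycl/sycl.hpp>
#include "sycl_hw.hpp"

int main() {
    // Query hardware info for the default GPU, as ggml_sycl_init() does per device.
    sycl::device device(sycl::gpu_selector_v);
    sycl_hw_info hw = get_device_hw_info(&device);
    std::printf("device_id=0x%x arch=%d\n", (unsigned) hw.device_id, (int) hw.arch);
    return 0;
}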
ggml/src/ggml-sycl/sycl_hw.hpp
ADDED
@@ -0,0 +1,23 @@
+#ifndef SYCL_HW_HPP
+#define SYCL_HW_HPP
+
+#include <algorithm>
+#include <stdio.h>
+#include <vector>
+#include <map>
+
+#include <sycl/sycl.hpp>
+
+namespace syclex = sycl::ext::oneapi::experimental;
+
+struct sycl_hw_info {
+  syclex::architecture arch;
+  int32_t device_id;
+};
+
+bool is_in_vector(std::vector<int> &vec, int item);
+
+sycl_hw_info get_device_hw_info(sycl::device *device_ptr);
+
+
+#endif // SYCL_HW_HPP