Spaces:
Running
Running
AidanBeltonS
commited on
Commit
·
340b830
1
Parent(s):
0534b5d
Add freq factors (llama/7495)
Browse files- ggml-sycl.cpp +57 -37
ggml-sycl.cpp
CHANGED
|
@@ -8830,12 +8830,11 @@ static void rope(
|
|
| 8830 |
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
| 8831 |
}
|
| 8832 |
|
| 8833 |
-
template<typename T, bool has_pos>
|
| 8834 |
static void rope_neox(
|
| 8835 |
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
| 8836 |
-
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
|
| 8837 |
-
,
|
| 8838 |
-
const sycl::nd_item<3> &item_ct1) {
|
| 8839 |
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
| 8840 |
item_ct1.get_local_id(1));
|
| 8841 |
|
|
@@ -8863,8 +8862,10 @@ static void rope_neox(
|
|
| 8863 |
float cur_rot = inv_ndims * ic - ib;
|
| 8864 |
|
| 8865 |
const int p = has_pos ? pos[i2] : 0;
|
|
|
|
|
|
|
| 8866 |
const float theta_base =
|
| 8867 |
-
p * freq_scale * dpct::pow(theta_scale, col / 2.0f);
|
| 8868 |
|
| 8869 |
float cos_theta, sin_theta;
|
| 8870 |
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
|
@@ -12413,7 +12414,7 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
|
|
| 12413 |
const int32_t *pos, float freq_scale,
|
| 12414 |
int p_delta_rows, float freq_base, float ext_factor,
|
| 12415 |
float attn_factor, rope_corr_dims corr_dims,
|
| 12416 |
-
dpct::queue_ptr stream) {
|
| 12417 |
GGML_ASSERT(ncols % 2 == 0);
|
| 12418 |
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
| 12419 |
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
|
|
@@ -12423,38 +12424,48 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
|
|
| 12423 |
const float inv_ndims = -1.0f / n_dims;
|
| 12424 |
|
| 12425 |
if (pos == nullptr) {
|
| 12426 |
-
/*
|
| 12427 |
-
DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
|
| 12428 |
-
the limit. To get the device limit, query
|
| 12429 |
-
info::device::max_work_group_size. Adjust the work-group size if needed.
|
| 12430 |
-
*/
|
| 12431 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 12432 |
{sycl::aspect::fp16});
|
| 12433 |
-
|
| 12434 |
-
|
| 12435 |
-
|
| 12436 |
-
|
| 12437 |
-
|
| 12438 |
-
|
| 12439 |
-
|
| 12440 |
-
|
| 12441 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12442 |
} else {
|
| 12443 |
-
/*
|
| 12444 |
-
DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
|
| 12445 |
-
the limit. To get the device limit, query
|
| 12446 |
-
info::device::max_work_group_size. Adjust the work-group size if needed.
|
| 12447 |
-
*/
|
| 12448 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 12449 |
{sycl::aspect::fp16});
|
| 12450 |
|
| 12451 |
-
|
| 12452 |
-
|
| 12453 |
-
|
| 12454 |
-
|
| 12455 |
-
|
| 12456 |
-
|
| 12457 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12458 |
}
|
| 12459 |
}
|
| 12460 |
|
|
@@ -13986,9 +13997,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
| 13986 |
ggml_tensor *dst, const float *src0_dd,
|
| 13987 |
const float *src1_dd, float *dst_dd,
|
| 13988 |
const dpct::queue_ptr &main_stream) {
|
| 13989 |
-
|
| 13990 |
-
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
|
| 13991 |
-
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
|
| 13992 |
|
| 13993 |
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
| 13994 |
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
@@ -14014,6 +14023,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
| 14014 |
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
| 14015 |
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
| 14016 |
|
|
|
|
| 14017 |
const int32_t * pos = nullptr;
|
| 14018 |
if ((mode & 1) == 0) {
|
| 14019 |
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
|
@@ -14024,6 +14034,16 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
| 14024 |
const bool is_neox = mode & 2;
|
| 14025 |
const bool is_glm = mode & 4;
|
| 14026 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14027 |
rope_corr_dims corr_dims;
|
| 14028 |
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
|
| 14029 |
|
|
@@ -14035,13 +14055,13 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
| 14035 |
if (src0->type == GGML_TYPE_F32) {
|
| 14036 |
rope_neox_sycl(
|
| 14037 |
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
| 14038 |
-
attn_factor, corr_dims, main_stream
|
| 14039 |
);
|
| 14040 |
} else if (src0->type == GGML_TYPE_F16) {
|
| 14041 |
rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
|
| 14042 |
ne00, n_dims, nrows, pos, freq_scale, ne01,
|
| 14043 |
freq_base, ext_factor, attn_factor, corr_dims,
|
| 14044 |
-
main_stream);
|
| 14045 |
} else {
|
| 14046 |
GGML_ASSERT(false);
|
| 14047 |
}
|
|
|
|
| 8830 |
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
| 8831 |
}
|
| 8832 |
|
| 8833 |
+
template<typename T, bool has_pos, bool has_freq_facs>
|
| 8834 |
static void rope_neox(
|
| 8835 |
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
| 8836 |
+
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims,
|
| 8837 |
+
const float * freq_factors, const sycl::nd_item<3> &item_ct1) {
|
|
|
|
| 8838 |
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
| 8839 |
item_ct1.get_local_id(1));
|
| 8840 |
|
|
|
|
| 8862 |
float cur_rot = inv_ndims * ic - ib;
|
| 8863 |
|
| 8864 |
const int p = has_pos ? pos[i2] : 0;
|
| 8865 |
+
const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
|
| 8866 |
+
|
| 8867 |
const float theta_base =
|
| 8868 |
+
p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor;
|
| 8869 |
|
| 8870 |
float cos_theta, sin_theta;
|
| 8871 |
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
|
|
|
| 12414 |
const int32_t *pos, float freq_scale,
|
| 12415 |
int p_delta_rows, float freq_base, float ext_factor,
|
| 12416 |
float attn_factor, rope_corr_dims corr_dims,
|
| 12417 |
+
const float * freq_factors, dpct::queue_ptr stream) {
|
| 12418 |
GGML_ASSERT(ncols % 2 == 0);
|
| 12419 |
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
| 12420 |
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
|
|
|
|
| 12424 |
const float inv_ndims = -1.0f / n_dims;
|
| 12425 |
|
| 12426 |
if (pos == nullptr) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12427 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 12428 |
{sycl::aspect::fp16});
|
| 12429 |
+
if (freq_factors == nullptr) {
|
| 12430 |
+
stream->parallel_for(
|
| 12431 |
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 12432 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 12433 |
+
rope_neox<T, false, false>(x, dst, ncols, n_dims, pos, freq_scale,
|
| 12434 |
+
p_delta_rows, ext_factor, attn_factor,
|
| 12435 |
+
corr_dims, theta_scale, inv_ndims, freq_factors,
|
| 12436 |
+
item_ct1);
|
| 12437 |
+
});
|
| 12438 |
+
} else {
|
| 12439 |
+
stream->parallel_for(
|
| 12440 |
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 12441 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 12442 |
+
rope_neox<T, false, true>(x, dst, ncols, n_dims, pos, freq_scale,
|
| 12443 |
+
p_delta_rows, ext_factor, attn_factor,
|
| 12444 |
+
corr_dims, theta_scale, inv_ndims, freq_factors,
|
| 12445 |
+
item_ct1);
|
| 12446 |
+
});
|
| 12447 |
+
}
|
| 12448 |
} else {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12449 |
dpct::has_capability_or_fail(stream->get_device(),
|
| 12450 |
{sycl::aspect::fp16});
|
| 12451 |
|
| 12452 |
+
if (freq_factors == nullptr) {
|
| 12453 |
+
stream->parallel_for(
|
| 12454 |
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 12455 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 12456 |
+
rope_neox<T, true, false>(x, dst, ncols, n_dims, pos, freq_scale,
|
| 12457 |
+
p_delta_rows, ext_factor, attn_factor,
|
| 12458 |
+
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
|
| 12459 |
+
});
|
| 12460 |
+
} else {
|
| 12461 |
+
stream->parallel_for(
|
| 12462 |
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
| 12463 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 12464 |
+
rope_neox<T, true, true>(x, dst, ncols, n_dims, pos, freq_scale,
|
| 12465 |
+
p_delta_rows, ext_factor, attn_factor,
|
| 12466 |
+
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
|
| 12467 |
+
});
|
| 12468 |
+
}
|
| 12469 |
}
|
| 12470 |
}
|
| 12471 |
|
|
|
|
| 13997 |
ggml_tensor *dst, const float *src0_dd,
|
| 13998 |
const float *src1_dd, float *dst_dd,
|
| 13999 |
const dpct::queue_ptr &main_stream) {
|
| 14000 |
+
const ggml_tensor * src2 = dst->src[2];
|
|
|
|
|
|
|
| 14001 |
|
| 14002 |
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
| 14003 |
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
|
|
| 14023 |
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
| 14024 |
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
| 14025 |
|
| 14026 |
+
const float * freq_factors = nullptr;
|
| 14027 |
const int32_t * pos = nullptr;
|
| 14028 |
if ((mode & 1) == 0) {
|
| 14029 |
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
|
|
|
| 14034 |
const bool is_neox = mode & 2;
|
| 14035 |
const bool is_glm = mode & 4;
|
| 14036 |
|
| 14037 |
+
if (is_neox) {
|
| 14038 |
+
pos = (const int32_t *) src1_dd;
|
| 14039 |
+
|
| 14040 |
+
if (src2 != nullptr) {
|
| 14041 |
+
freq_factors = (const float *) src2->data;
|
| 14042 |
+
}
|
| 14043 |
+
} else {
|
| 14044 |
+
GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
|
| 14045 |
+
}
|
| 14046 |
+
|
| 14047 |
rope_corr_dims corr_dims;
|
| 14048 |
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
|
| 14049 |
|
|
|
|
| 14055 |
if (src0->type == GGML_TYPE_F32) {
|
| 14056 |
rope_neox_sycl(
|
| 14057 |
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
| 14058 |
+
attn_factor, corr_dims, freq_factors, main_stream
|
| 14059 |
);
|
| 14060 |
} else if (src0->type == GGML_TYPE_F16) {
|
| 14061 |
rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
|
| 14062 |
ne00, n_dims, nrows, pos, freq_scale, ne01,
|
| 14063 |
freq_base, ext_factor, attn_factor, corr_dims,
|
| 14064 |
+
freq_factors, main_stream);
|
| 14065 |
} else {
|
| 14066 |
GGML_ASSERT(false);
|
| 14067 |
}
|