Commit 71850e7
Parent(s): 154bf2b
Update Vulkan RoPE implementation (llama/7818)
* Update Vulkan RoPE implementation
* Return nullptr on alloc_buffer when allocation fails, instead of throwing an exception
Minor fixes
* Fix segfault when running out of VRAM
---------
Co-authored-by: slaren <[email protected]>
- ggml-alloc.c +1 -1
- ggml-vulkan.cpp +34 -59
ggml-alloc.c
CHANGED
@@ -886,7 +886,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
         for (size_t i = 0; i < *n_buffers; i++) {
-            ggml_backend_buffer_free(*buffers[i]);
+            ggml_backend_buffer_free((*buffers)[i]);
         }
         free(*buffers);
         return false;
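Note on the ggml-alloc.c hunk: `buffers` is a pointer to the (re)allocated array of backend buffers, so `*buffers[i]` parses as `*(buffers[i])` and indexes the wrong pointer, while `(*buffers)[i]` dereferences first and then indexes the array, which is what the cleanup loop needs. A minimal self-contained sketch of the precedence difference, with hypothetical `buffer`/`free_all` names that are not part of the patch:

#include <cstddef>
#include <cstdio>

struct buffer { int id; };

// 'buffers' points at a heap-allocated array of buffer pointers, mirroring the
// out-parameter shape used by alloc_tensor_range.
void free_all(buffer *** buffers, std::size_t n) {
    for (std::size_t i = 0; i < n; i++) {
        // *buffers[i] would parse as *(buffers[i]) and only be correct for i == 0;
        // (*buffers)[i] indexes the actual array.
        buffer * b = (*buffers)[i];
        std::printf("freeing buffer %d\n", b->id);
        delete b;
    }
    delete[] *buffers;
    *buffers = nullptr;
}

int main() {
    buffer ** arr = new buffer*[2]{ new buffer{0}, new buffer{1} };
    free_all(&arr, 2);
}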
ggml-vulkan.cpp
CHANGED
@@ -150,7 +150,7 @@ struct vk_device {
     vk_pipeline pipeline_relu_f32;
     vk_pipeline pipeline_diag_mask_inf_f32;
     vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
-    vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
+    vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
     vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
     vk_pipeline pipeline_argsort_f32;
     vk_pipeline pipeline_sum_rows_f32;
@@ -283,26 +283,15 @@ struct vk_op_diag_mask_push_constants {
 
 struct vk_op_rope_push_constants {
     uint32_t ncols;
+    uint32_t n_dims;
     float freq_scale;
     uint32_t p_delta_rows;
     float freq_base;
     float ext_factor;
     float attn_factor;
-    float corr_dims[4];
-};
-
-struct vk_op_rope_neox_push_constants {
-    uint32_t ncols;
-    uint32_t ndims;
-    float freq_scale;
-    uint32_t p_delta_rows;
-    float freq_base;
-    float ext_factor;
-    float attn_factor;
-    float corr_dims[4];
+    float corr_dims[2];
     float theta_scale;
-    float inv_ndims;
-    uint32_t has_freq_facs;
+    uint32_t has_ff;
 };
 
 struct vk_op_soft_max_push_constants {
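Note on the push-constant change above: the separate vk_op_rope_neox_push_constants struct is removed and the NORM and NEOX paths now share a single vk_op_rope_push_constants layout, so all four RoPE pipelines can be created with the same sizeof below. A host-side sketch of the merged layout follows; the struct body is copied from the hunk, while the size check and the remark about Vulkan's push-constant limit are illustrative additions, not part of the patch:

#include <cstdint>

struct vk_op_rope_push_constants {
    uint32_t ncols;
    uint32_t n_dims;
    float    freq_scale;
    uint32_t p_delta_rows;
    float    freq_base;
    float    ext_factor;
    float    attn_factor;
    float    corr_dims[2];
    float    theta_scale;
    uint32_t has_ff;
};

// Eleven 4-byte scalars (corr_dims counts as two), 44 bytes in total, which is
// well under the 128-byte maxPushConstantsSize minimum that Vulkan guarantees.
static_assert(sizeof(vk_op_rope_push_constants) == 44, "unexpected padding");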
@@ -1534,11 +1523,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
 
@@ -3905,10 +3894,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             }
         } else {
             if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                return ctx->device->pipeline_rope_f32;
+                return ctx->device->pipeline_rope_norm_f32;
             }
             if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-                return ctx->device->pipeline_rope_f16;
+                return ctx->device->pipeline_rope_norm_f16;
             }
         }
         return nullptr;
@@ -4152,24 +4141,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             ggml_vk_sync_buffers(subctx);
             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else if (op == GGML_OP_ROPE) {
-            const int mode = ((int32_t *) dst->op_params)[2];
-            const bool is_neox = mode & 2;
-
-            if (is_neox) {
-                // Empty src2 is possible in rope, but the shader needs a buffer
-                vk_subbuffer subbuf_z;
-                if (use_src2) {
-                    subbuf_z = { d_Z, z_buf_offset, z_sz };
-                } else {
-                    subbuf_z = { d_X, 0, d_X->size };
-                }
-
-                ggml_vk_sync_buffers(subctx);
-                ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
-            } else {
-                ggml_vk_sync_buffers(subctx);
-                ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
-            }
+            // Empty src2 is possible in rope, but the shader needs a buffer
+            vk_subbuffer subbuf_z;
+            if (use_src2) {
+                subbuf_z = { d_Z, z_buf_offset, z_sz };
+            } else {
+                subbuf_z = { d_X, 0, d_X->size };
+            }
+
+            ggml_vk_sync_buffers(subctx);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else if (use_src2) {
             ggml_vk_sync_buffers(subctx);
             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
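Note on the dispatch hunk above: the RoPE pipelines now all take four descriptor bindings, and, as the in-diff comment says, the shader still needs a buffer in the frequency-factor slot even when src2 is absent, so src0's buffer is rebound there and the has_ff push constant tells the shader whether the data is meaningful. A small illustrative sketch of that fallback, with simplified types standing in for vk_subbuffer:

#include <cstddef>

// Illustrative only: the real vk_subbuffer holds a vk_buffer handle plus an
// offset and size; an int id stands in for the handle here.
struct subbuffer { int buffer_id; std::size_t offset; std::size_t size; };

// When the optional frequency-factor tensor (src2) is missing, bind an existing
// buffer (src0's) so the descriptor slot stays valid; the shader decides via
// has_ff whether to actually read it.
subbuffer pick_freq_factors(bool use_src2, subbuffer src2_buf, subbuffer src0_buf) {
    return use_src2 ? src2_buf : subbuffer{ src0_buf.buffer_id, 0, src0_buf.size };
}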
@@ -4391,7 +4372,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
 
 static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
     const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode = ((int32_t *) dst->op_params)[2];
+    // const int mode = ((int32_t *) dst->op_params)[2];
     // const int n_ctx = ((int32_t *) dst->op_params)[3];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
     const float freq_base = ((float *) dst->op_params)[5];
@@ -4401,28 +4382,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
     const float beta_fast = ((float *) dst->op_params)[9];
     const float beta_slow = ((float *) dst->op_params)[10];
 
-    const bool is_neox = mode & 2;
-
-#pragma message("TODO: update rope NORM mode to match NEOX mode")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7634")
-
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-    if (is_neox) {
-        const float theta_scale = powf(freq_base, -2.0f/n_dims);
-        const float inv_ndims = -1.0f / n_dims;
-        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
-            (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
-            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f},
-            theta_scale, inv_ndims, src2 != nullptr,
-        });
-    } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
-            (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
-            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
-        });
-    }
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+        (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+        freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
+        src2 != nullptr,
+    });
 }
 
 static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
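Note on the ggml_vk_rope hunk: both RoPE modes now go through the same call, and only theta_scale = powf(freq_base, -2.0f/n_dims) is pushed rather than a per-dimension table, because the rotation base for pair i is freq_base^(-2*i/n_dims) = theta_scale^i and can be rebuilt by repeated multiplication. A small CPU-side sketch of that recurrence (illustrative values, not the actual GLSL shader):

#include <cmath>
#include <cstdio>

int main() {
    const float freq_base = 10000.0f;   // typical RoPE base, for illustration only
    const int   n_dims    = 8;

    // Same expression as the host code above pushes to the shader.
    const float theta_scale = std::pow(freq_base, -2.0f / n_dims);

    // theta_i = freq_base^(-2*i/n_dims) = theta_scale^i, so one multiply per
    // dimension pair reproduces the whole frequency ladder.
    float theta = 1.0f;
    for (int i = 0; i < n_dims / 2; i++) {
        std::printf("pair %d: theta = %g (direct: %g)\n",
                    i, theta, std::pow(freq_base, -2.0f * i / n_dims));
        theta *= theta_scale;
    }
    return 0;
}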
@@ -6070,7 +6039,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
     std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
 #endif
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+
+    vk_buffer dev_buffer = nullptr;
+    try {
+        dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+    } catch (const vk::SystemError& e) {
+        return nullptr;
+    }
 
     ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
 
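Note on the alloc_buffer hunk: ggml_vk_create_buffer_device can throw vk::SystemError (Vulkan-Hpp's exception type for failed calls such as running out of device memory). Catching it here and returning nullptr matches the commit message ("Return nullptr on alloc_buffer when allocation fails, instead of throwing an exception") and lets the allocator take its normal failure path, which the ggml-alloc.c hunk above checks for. A caller-side sketch of that contract, with hypothetical example_* names standing in for the real buffer-type API:

#include <cstddef>
#include <cstdio>

struct example_buffer { std::size_t size; };

// Stand-in for an allocator that follows the new contract: nullptr on failure,
// no exception escaping the C API.
example_buffer * example_alloc(std::size_t size) {
    if (size > 1024) {        // pretend anything over 1 KiB is "out of VRAM"
        return nullptr;
    }
    return new example_buffer{ size };
}

int main() {
    const std::size_t sizes[] = { 512, 4096 };
    for (std::size_t size : sizes) {
        example_buffer * buf = example_alloc(size);
        if (buf == nullptr) {
            std::fprintf(stderr, "allocation of %zu bytes failed, caller can fall back\n", size);
            continue;         // graph allocation reports failure instead of crashing
        }
        std::printf("allocated %zu bytes\n", buf->size);
        delete buf;
    }
    return 0;
}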
@@ -6466,7 +6441,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         //     return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
         // } break;
         case GGML_OP_ROPE:
-            return true;
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
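Note on the supports_op hunk: the Vulkan backend now claims GGML_OP_ROPE only when src0 is contiguous, so non-contiguous views can be scheduled on a different backend by ggml instead of being fed to a shader that assumes a packed layout. A rough sketch of what that guard means, simplified to ignore block-quantized types, with an illustrative struct standing in for ggml_tensor:

#include <cstddef>
#include <cstdint>

// Illustrative 4-D descriptor: ne[] are element counts, nb[] are byte strides.
struct tensor4d {
    int64_t     ne[4];
    std::size_t nb[4];
    std::size_t type_size;
};

// Contiguous here means packed row-major: the first stride equals the element
// size and each higher stride is the previous stride times the previous extent.
bool is_contiguous(const tensor4d & t) {
    if (t.nb[0] != t.type_size) {
        return false;
    }
    for (int i = 1; i < 4; i++) {
        if (t.nb[i] != t.nb[i - 1] * (std::size_t) t.ne[i - 1]) {
            return false;
        }
    }
    return true;
}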