Commit be0ec58 · 1 parent: b38d0f9
Update vulkan rope implementation to support frequency factors (llama/7475)

ggml-vulkan.cpp CHANGED (+168 -91)
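The change threads an optional third source tensor (src2) of per-dimension frequency factors through the Vulkan backend and signals its presence to the NeoX RoPE shader via the new has_freq_facs push constant. As a rough illustration of what such factors do, the following CPU-side sketch divides the RoPE rotation angle by the factor for its dimension pair. This is a simplified illustration under stated assumptions, not the shader code from this commit: the name rope_neox_row is made up here, and the YaRN correction terms (corr_dims, ext_factor, attn_factor) that the real kernel also handles are omitted.

// Rough sketch, not the shader from this commit: where per-dimension
// frequency factors (the optional src2 tensor) enter the NeoX RoPE angle.
// YaRN correction (corr_dims, ext_factor, attn_factor) is omitted.
#include <cmath>

static void rope_neox_row(float * x, int n_dims, int pos,
                          float freq_base, float freq_scale,
                          const float * freq_factors /* nullptr if absent */) {
    const int half = n_dims / 2;
    for (int i0 = 0; i0 < half; ++i0) {
        // base angle for dimension pair i0: pos * freq_base^(-2*i0/n_dims)
        float theta = pos * std::pow(freq_base, -2.0f * i0 / n_dims);
        if (freq_factors != nullptr) {
            theta /= freq_factors[i0];   // per-pair frequency factor
        }
        theta *= freq_scale;
        const float c = std::cos(theta);
        const float s = std::sin(theta);
        // NeoX layout pairs element i0 with element i0 + n_dims/2
        const float a = x[i0];
        const float b = x[i0 + half];
        x[i0]        = a * c - b * s;
        x[i0 + half] = a * s + b * c;
    }
}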
|
@@ -290,6 +290,7 @@ struct vk_op_rope_neox_push_constants {
|
|
| 290 |
float corr_dims[4];
|
| 291 |
float theta_scale;
|
| 292 |
float inv_ndims;
|
|
|
|
| 293 |
};
|
| 294 |
|
| 295 |
struct vk_op_soft_max_push_constants {
|
|
@@ -1522,8 +1523,8 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
| 1522 |
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
| 1523 |
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
| 1524 |
|
| 1525 |
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main",
|
| 1526 |
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main",
|
| 1527 |
|
| 1528 |
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
|
| 1529 |
}
|
|
@@ -3732,7 +3733,7 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
|
|
| 3732 |
}
|
| 3733 |
|
| 3734 |
|
| 3735 |
-
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) {
|
| 3736 |
switch (op) {
|
| 3737 |
case GGML_OP_ADD:
|
| 3738 |
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
@@ -3853,6 +3854,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|
| 3853 |
default:
|
| 3854 |
return nullptr;
|
| 3855 |
}
|
| 3856 |
}
|
| 3857 |
|
| 3858 |
static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
|
|
@@ -3880,12 +3883,15 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
|
| 3880 |
}
|
| 3881 |
|
| 3882 |
template<typename PC>
|
| 3883 |
-
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) {
|
| 3884 |
#ifdef GGML_VULKAN_DEBUG
|
| 3885 |
std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
| 3886 |
if (src1 != nullptr) {
|
| 3887 |
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
| 3888 |
}
|
| 3889 |
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
|
| 3890 |
#endif
|
| 3891 |
GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
|
|
@@ -3896,6 +3902,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 3896 |
const uint64_t ne02 = src0->ne[2];
|
| 3897 |
const uint64_t ne03 = src0->ne[3];
|
| 3898 |
const uint64_t ne0 = ne00 * ne01;
|
|
|
|
| 3899 |
const bool use_src1 = src1 != nullptr;
|
| 3900 |
const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
|
| 3901 |
const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
|
|
@@ -3904,7 +3911,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 3904 |
const uint64_t ne1 = ne10 * ne11;
|
| 3905 |
// const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
|
| 3906 |
|
| 3907 |
-
| 3908 |
ggml_vk_func_t op_func;
|
| 3909 |
|
| 3910 |
if (pipeline == nullptr) {
|
|
@@ -3927,15 +3941,18 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 3927 |
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
| 3928 |
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
| 3929 |
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
|
|
|
| 3930 |
|
| 3931 |
vk_buffer d_X = nullptr;
|
| 3932 |
size_t x_buf_offset = 0;
|
| 3933 |
vk_buffer d_Y = nullptr;
|
| 3934 |
size_t y_buf_offset = 0;
|
| 3935 |
vk_buffer d_Z = nullptr;
|
|
|
|
| 3936 |
|
| 3937 |
bool src0_uma = false;
|
| 3938 |
bool src1_uma = false;
|
|
|
|
| 3939 |
|
| 3940 |
if (ctx->device->uma) {
|
| 3941 |
ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset);
|
|
@@ -3944,10 +3961,15 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 3944 |
ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset);
|
| 3945 |
src1_uma = d_Y != nullptr;
|
| 3946 |
}
|
| 3947 |
}
|
| 3948 |
|
| 3949 |
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
| 3950 |
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
|
|
|
| 3951 |
uint64_t d_sz = ggml_type_size(dst->type) * ne0;
|
| 3952 |
|
| 3953 |
vk_buffer d_D = extra->buffer_gpu.lock();
|
|
@@ -3970,10 +3992,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 3970 |
y_buf_offset = extra_src1->offset;
|
| 3971 |
GGML_ASSERT(d_Y != nullptr);
|
| 3972 |
}
|
| 3973 |
|
| 3974 |
if (op_supports_incontiguous) {
|
| 3975 |
x_sz = ggml_nbytes(src0);
|
| 3976 |
y_sz = use_src1 ? ggml_nbytes(src1) : 0;
|
|
|
|
| 3977 |
d_sz = ggml_nbytes(dst);
|
| 3978 |
|
| 3979 |
if (x_buf_offset + x_sz >= d_X->size) {
|
|
@@ -3982,6 +4010,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 3982 |
if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
|
| 3983 |
y_sz = VK_WHOLE_SIZE;
|
| 3984 |
}
|
| 3985 |
if (d_buf_offset + d_sz >= d_D->size) {
|
| 3986 |
d_sz = VK_WHOLE_SIZE;
|
| 3987 |
}
|
|
@@ -4021,13 +4052,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 4021 |
if (use_src1 && y_sz != VK_WHOLE_SIZE) {
|
| 4022 |
y_sz *= ne12 * ne13;
|
| 4023 |
}
|
| 4024 |
if (d_sz != VK_WHOLE_SIZE) {
|
| 4025 |
d_sz *= ne02 * ne03;
|
| 4026 |
}
|
| 4027 |
}
|
| 4028 |
|
| 4029 |
if (op == GGML_OP_SOFT_MAX) {
|
| 4030 |
-
// Empty src1 is possible
|
| 4031 |
vk_subbuffer subbuf_y;
|
| 4032 |
if (use_src1) {
|
| 4033 |
subbuf_y = { d_Y, y_buf_offset, y_sz };
|
|
@@ -4037,6 +4071,28 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 4037 |
|
| 4038 |
ggml_vk_sync_buffers(subctx);
|
| 4039 |
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
| 4040 |
} else if (use_src1) {
|
| 4041 |
ggml_vk_sync_buffers(subctx);
|
| 4042 |
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
|
@@ -4047,6 +4103,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 4047 |
} else {
|
| 4048 |
GGML_ASSERT(op != GGML_OP_SOFT_MAX);
|
| 4049 |
GGML_ASSERT(op != GGML_OP_ARGSORT);
|
|
|
|
| 4050 |
|
| 4051 |
ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
|
| 4052 |
|
|
@@ -4088,7 +4145,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 4088 |
}
|
| 4089 |
|
| 4090 |
static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 4091 |
-
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
|
| 4092 |
}
|
| 4093 |
|
| 4094 |
static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4096,7 +4153,7 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
| 4096 |
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
| 4097 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4098 |
|
| 4099 |
-
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_GET_ROWS, {
|
| 4100 |
(uint32_t)ggml_nelements(src0),
|
| 4101 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4102 |
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
@@ -4111,7 +4168,7 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
|
| 4111 |
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
| 4112 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4113 |
|
| 4114 |
-
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ADD, {
|
| 4115 |
(uint32_t)ggml_nelements(src0),
|
| 4116 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4117 |
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
@@ -4126,7 +4183,7 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
|
| 4126 |
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
| 4127 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4128 |
|
| 4129 |
-
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_MUL, {
|
| 4130 |
(uint32_t)ggml_nelements(src0),
|
| 4131 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4132 |
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
@@ -4141,7 +4198,7 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, co
|
|
| 4141 |
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
| 4142 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4143 |
|
| 4144 |
-
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_SCALE, {
|
| 4145 |
(uint32_t)ggml_nelements(src0),
|
| 4146 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4147 |
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
@@ -4154,7 +4211,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
|
| 4154 |
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
| 4155 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4156 |
|
| 4157 |
-
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_SQR, {
|
| 4158 |
(uint32_t)ggml_nelements(src0),
|
| 4159 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4160 |
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
@@ -4168,7 +4225,7 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, co
|
|
| 4168 |
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
| 4169 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4170 |
|
| 4171 |
-
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CLAMP, {
|
| 4172 |
(uint32_t)ggml_nelements(src0),
|
| 4173 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4174 |
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
@@ -4183,7 +4240,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
|
| 4183 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4184 |
const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
|
| 4185 |
|
| 4186 |
-
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CPY, {
|
| 4187 |
(uint32_t)ggml_nelements(src0),
|
| 4188 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4189 |
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
@@ -4195,21 +4252,21 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
|
| 4195 |
static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
| 4196 |
float * op_params = (float *)dst->op_params;
|
| 4197 |
|
| 4198 |
-
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
| 4199 |
}
|
| 4200 |
|
| 4201 |
static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
| 4202 |
float * op_params = (float *)dst->op_params;
|
| 4203 |
-
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
| 4204 |
}
|
| 4205 |
|
| 4206 |
static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
| 4207 |
-
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
|
| 4208 |
}
|
| 4209 |
|
| 4210 |
static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
| 4211 |
int32_t * op_params = (int32_t *)dst->op_params;
|
| 4212 |
-
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
|
| 4213 |
}
|
| 4214 |
|
| 4215 |
static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4228,7 +4285,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
| 4228 |
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
| 4229 |
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
| 4230 |
|
| 4231 |
-
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_SOFT_MAX, {
|
| 4232 |
ncols,
|
| 4233 |
src1 != nullptr ? nrows_y : (uint32_t)0,
|
| 4234 |
scale, max_bias,
|
|
@@ -4237,11 +4294,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
| 4237 |
});
|
| 4238 |
}
|
| 4239 |
|
| 4240 |
-
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 4241 |
-
#pragma message("TODO: implement phi3 frequency factors support")
|
| 4242 |
-
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
|
| 4243 |
-
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
|
| 4244 |
-
|
| 4245 |
const int n_dims = ((int32_t *) dst->op_params)[1];
|
| 4246 |
const int mode = ((int32_t *) dst->op_params)[2];
|
| 4247 |
// const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
@@ -4264,12 +4317,13 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
|
|
| 4264 |
if (is_neox) {
|
| 4265 |
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
| 4266 |
const float inv_ndims = -1.0f / n_dims;
|
| 4267 |
-
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, {
|
| 4268 |
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
|
| 4269 |
-
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims
|
|
|
|
| 4270 |
});
|
| 4271 |
} else {
|
| 4272 |
-
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, {
|
| 4273 |
(uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
|
| 4274 |
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
|
| 4275 |
});
|
|
@@ -4292,7 +4346,7 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
| 4292 |
|
| 4293 |
std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
|
| 4294 |
|
| 4295 |
-
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_ARGSORT, {
|
| 4296 |
ncols,
|
| 4297 |
ncols_pad,
|
| 4298 |
op_params[0],
|
|
@@ -5408,6 +5462,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
| 5408 |
|
| 5409 |
const ggml_tensor * src0 = node->src[0];
|
| 5410 |
const ggml_tensor * src1 = node->src[1];
|
|
|
|
| 5411 |
|
| 5412 |
switch (node->op) {
|
| 5413 |
case GGML_OP_UNARY:
|
|
@@ -5524,7 +5579,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
| 5524 |
|
| 5525 |
break;
|
| 5526 |
case GGML_OP_ROPE:
|
| 5527 |
-
ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node);
|
| 5528 |
|
| 5529 |
break;
|
| 5530 |
case GGML_OP_ARGSORT:
|
|
@@ -6500,7 +6555,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
|
|
| 6500 |
for (int j = 0; j < level; j++) {
|
| 6501 |
std::cerr << " ";
|
| 6502 |
}
|
| 6503 |
-
std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) <<
|
| 6504 |
|
| 6505 |
done.push_back(tensor);
|
| 6506 |
|
|
@@ -6550,7 +6605,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
|
|
| 6550 |
static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
|
| 6551 |
void * tensor_data = tensor->data;
|
| 6552 |
|
| 6553 |
-
if (tensor->
|
| 6554 |
const size_t tensor_size = ggml_nbytes(tensor);
|
| 6555 |
tensor_data = malloc(tensor_size);
|
| 6556 |
|
|
@@ -6561,12 +6616,12 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
|
|
| 6561 |
}
|
| 6562 |
|
| 6563 |
std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
|
| 6564 |
-
std::cerr << "tensor=" << tensor << " tensor->
|
| 6565 |
if (tensor->src[0] != nullptr) {
|
| 6566 |
-
std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << "
|
| 6567 |
}
|
| 6568 |
if (tensor->src[1] != nullptr) {
|
| 6569 |
-
std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << "
|
| 6570 |
}
|
| 6571 |
std::cerr << std::endl << "Result:" << std::endl;
|
| 6572 |
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
|
|
@@ -6577,43 +6632,11 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
|
|
| 6577 |
std::vector<const ggml_tensor *> done;
|
| 6578 |
ggml_vk_print_graph_origin(tensor, done);
|
| 6579 |
|
| 6580 |
-
if (tensor->
|
| 6581 |
free(tensor_data);
|
| 6582 |
}
|
| 6583 |
}
|
| 6584 |
|
| 6585 |
-
static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
|
| 6586 |
-
return;
|
| 6587 |
-
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
|
| 6588 |
-
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
| 6589 |
-
return;
|
| 6590 |
-
}
|
| 6591 |
-
for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
|
| 6592 |
-
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
|
| 6593 |
-
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
|
| 6594 |
-
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
| 6595 |
-
float val = 0.0f;
|
| 6596 |
-
if (tensor->type == GGML_TYPE_F32) {
|
| 6597 |
-
val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
|
| 6598 |
-
} else if (tensor->type == GGML_TYPE_F16) {
|
| 6599 |
-
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
|
| 6600 |
-
}
|
| 6601 |
-
if (std::isnan(val)) {
|
| 6602 |
-
std::cerr << "ERROR: TENSOR CHECK " << name << ": Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " val=" << val << std::endl;
|
| 6603 |
-
std::cerr << "tensor=" << tensor << " tensor->type=" << ggml_type_name(tensor->type) << " tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
|
| 6604 |
-
std::cerr << std::endl;
|
| 6605 |
-
ggml_vk_print_tensor_area(tensor, tensor->data, i0, i1, i2, i3);
|
| 6606 |
-
std::cerr << std::endl;
|
| 6607 |
-
std::vector<const ggml_tensor *> done;
|
| 6608 |
-
ggml_vk_print_graph_origin(tensor, done);
|
| 6609 |
-
GGML_ASSERT(false);
|
| 6610 |
-
}
|
| 6611 |
-
}
|
| 6612 |
-
}
|
| 6613 |
-
}
|
| 6614 |
-
}
|
| 6615 |
-
}
|
| 6616 |
-
|
| 6617 |
void * comp_result;
|
| 6618 |
size_t comp_size;
|
| 6619 |
size_t comp_nb[GGML_MAX_DIMS];
|
|
@@ -6637,6 +6660,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6637 |
|
| 6638 |
ggml_tensor * src0 = tensor->src[0];
|
| 6639 |
ggml_tensor * src1 = tensor->src[1];
|
|
|
|
| 6640 |
|
| 6641 |
struct ggml_init_params iparams = {
|
| 6642 |
/*.mem_size =*/ 1024*1024*1024,
|
|
@@ -6666,10 +6690,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6666 |
|
| 6667 |
src0_buffer = malloc(src0_size);
|
| 6668 |
src0_clone->data = src0_buffer;
|
| 6669 |
-
if (src0->
|
| 6670 |
memcpy(src0_clone->data, src0->data, src0_size);
|
| 6671 |
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
| 6672 |
-
} else if (src0->
|
| 6673 |
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
|
| 6674 |
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
| 6675 |
uint64_t offset = extra->offset;
|
|
@@ -6700,8 +6724,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6700 |
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
| 6701 |
ggml_vk_print_tensor(ctx, src0, "src0");
|
| 6702 |
}
|
| 6703 |
-
|
| 6704 |
-
ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone);
|
| 6705 |
}
|
| 6706 |
if (src1 != nullptr) {
|
| 6707 |
src1_clone = ggml_dup_tensor(ggml_ctx, src1);
|
|
@@ -6710,10 +6732,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6710 |
|
| 6711 |
src1_buffer = malloc(src1_size);
|
| 6712 |
src1_clone->data = src1_buffer;
|
| 6713 |
-
if (src1->
|
| 6714 |
memcpy(src1_clone->data, src1->data, src1_size);
|
| 6715 |
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
| 6716 |
-
} else if (src1->
|
| 6717 |
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
|
| 6718 |
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
| 6719 |
uint64_t offset = extra->offset;
|
|
@@ -6744,12 +6766,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6744 |
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
| 6745 |
ggml_vk_print_tensor(ctx, src1, "src1");
|
| 6746 |
std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
|
| 6747 |
-
std::cerr << "src1_clone=" << tensor << " src1_clone->
|
| 6748 |
if (src1->src[0] != nullptr) {
|
| 6749 |
-
std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << "
|
| 6750 |
}
|
| 6751 |
if (src1->src[1] != nullptr) {
|
| 6752 |
-
std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << "
|
| 6753 |
}
|
| 6754 |
std::cerr << std::endl << "Result:" << std::endl;
|
| 6755 |
ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
|
|
@@ -6760,8 +6782,64 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6760 |
std::vector<const ggml_tensor *> done;
|
| 6761 |
ggml_vk_print_graph_origin(src1_clone, done);
|
| 6762 |
}
|
| 6763 |
|
| 6764 |
-
|
|
| 6765 |
}
|
| 6766 |
|
| 6767 |
if (tensor->op == GGML_OP_MUL_MAT) {
|
|
@@ -6799,7 +6877,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6799 |
float attn_factor = ((float *) tensor->op_params)[8];
|
| 6800 |
float beta_fast = ((float *) tensor->op_params)[9];
|
| 6801 |
float beta_slow = ((float *) tensor->op_params)[10];
|
| 6802 |
-
tensor_clone =
|
| 6803 |
} else if (tensor->op == GGML_OP_UNARY) {
|
| 6804 |
switch (ggml_get_unary_op(tensor)) {
|
| 6805 |
case GGML_UNARY_OP_SILU:
|
|
@@ -6847,7 +6925,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6847 |
|
| 6848 |
ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
|
| 6849 |
|
| 6850 |
-
ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
|
| 6851 |
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
| 6852 |
ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
|
| 6853 |
}
|
|
@@ -6888,7 +6965,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6888 |
|
| 6889 |
void * tensor_data = tensor->data;
|
| 6890 |
|
| 6891 |
-
if (tensor->
|
| 6892 |
size_t tensor_size = ggml_nbytes(tensor);
|
| 6893 |
tensor_data = malloc(tensor_size);
|
| 6894 |
|
|
@@ -6936,12 +7013,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6936 |
|
| 6937 |
if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
|
| 6938 |
std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
|
| 6939 |
-
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->
|
| 6940 |
if (src0 != nullptr) {
|
| 6941 |
-
std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << "
|
| 6942 |
}
|
| 6943 |
if (src1 != nullptr) {
|
| 6944 |
-
std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << "
|
| 6945 |
}
|
| 6946 |
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
| 6947 |
std::cerr << std::endl << "Result:" << std::endl;
|
|
@@ -6977,12 +7054,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6977 |
|
| 6978 |
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
| 6979 |
std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
|
| 6980 |
-
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->
|
| 6981 |
if (src0 != nullptr) {
|
| 6982 |
-
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << "
|
| 6983 |
}
|
| 6984 |
if (src1 != nullptr) {
|
| 6985 |
-
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << "
|
| 6986 |
}
|
| 6987 |
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
| 6988 |
std::cerr << std::endl << "Result:" << std::endl;
|
|
@@ -7001,12 +7078,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 7001 |
|
| 7002 |
if (avg_err > 0.05 || std::isnan(avg_err)) {
|
| 7003 |
std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
|
| 7004 |
-
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->
|
| 7005 |
if (src0 != nullptr) {
|
| 7006 |
-
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << "
|
| 7007 |
}
|
| 7008 |
if (src1 != nullptr) {
|
| 7009 |
-
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << "
|
| 7010 |
}
|
| 7011 |
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
| 7012 |
std::cerr << std::endl << "Result:" << std::endl;
|
|
@@ -7018,14 +7095,14 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 7018 |
ggml_vk_print_graph_origin(tensor, done);
|
| 7019 |
GGML_ASSERT(false);
|
| 7020 |
} else {
|
| 7021 |
-
std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << "
|
| 7022 |
}
|
| 7023 |
|
| 7024 |
free(comp_result);
|
| 7025 |
comp_result = nullptr;
|
| 7026 |
comp_size = 0;
|
| 7027 |
|
| 7028 |
-
if (tensor->
|
| 7029 |
free(tensor_data);
|
| 7030 |
}
|
| 7031 |
}
|
|
|
|
| 290 |
float corr_dims[4];
|
| 291 |
float theta_scale;
|
| 292 |
float inv_ndims;
|
| 293 |
+
uint32_t has_freq_facs;
|
| 294 |
};
|
| 295 |
|
| 296 |
struct vk_op_soft_max_push_constants {
|
|
|
|
| 1523 |
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
| 1524 |
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
| 1525 |
|
| 1526 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
| 1527 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
| 1528 |
|
| 1529 |
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
|
| 1530 |
}
|
|
|
|
| 3733 |
}
|
| 3734 |
|
| 3735 |
|
| 3736 |
+
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
|
| 3737 |
switch (op) {
|
| 3738 |
case GGML_OP_ADD:
|
| 3739 |
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
|
|
| 3854 |
default:
|
| 3855 |
return nullptr;
|
| 3856 |
}
|
| 3857 |
+
|
| 3858 |
+
GGML_UNUSED(src2);
|
| 3859 |
}
|
| 3860 |
|
| 3861 |
static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
|
|
|
|
| 3883 |
}
|
| 3884 |
|
| 3885 |
template<typename PC>
|
| 3886 |
+
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
|
| 3887 |
#ifdef GGML_VULKAN_DEBUG
|
| 3888 |
std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
| 3889 |
if (src1 != nullptr) {
|
| 3890 |
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
| 3891 |
}
|
| 3892 |
+
if (src2 != nullptr) {
|
| 3893 |
+
std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
|
| 3894 |
+
}
|
| 3895 |
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
|
| 3896 |
#endif
|
| 3897 |
GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
|
|
|
|
| 3902 |
const uint64_t ne02 = src0->ne[2];
|
| 3903 |
const uint64_t ne03 = src0->ne[3];
|
| 3904 |
const uint64_t ne0 = ne00 * ne01;
|
| 3905 |
+
|
| 3906 |
const bool use_src1 = src1 != nullptr;
|
| 3907 |
const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
|
| 3908 |
const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
|
|
|
|
| 3911 |
const uint64_t ne1 = ne10 * ne11;
|
| 3912 |
// const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
|
| 3913 |
|
| 3914 |
+
const bool use_src2 = src2 != nullptr;
|
| 3915 |
+
const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
|
| 3916 |
+
const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
|
| 3917 |
+
const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
|
| 3918 |
+
const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
|
| 3919 |
+
const uint64_t ne2 = ne20 * ne21;
|
| 3920 |
+
|
| 3921 |
+
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
|
| 3922 |
ggml_vk_func_t op_func;
|
| 3923 |
|
| 3924 |
if (pipeline == nullptr) {
|
|
|
|
| 3941 |
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
| 3942 |
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
| 3943 |
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
| 3944 |
+
ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
|
| 3945 |
|
| 3946 |
vk_buffer d_X = nullptr;
|
| 3947 |
size_t x_buf_offset = 0;
|
| 3948 |
vk_buffer d_Y = nullptr;
|
| 3949 |
size_t y_buf_offset = 0;
|
| 3950 |
vk_buffer d_Z = nullptr;
|
| 3951 |
+
size_t z_buf_offset = 0;
|
| 3952 |
|
| 3953 |
bool src0_uma = false;
|
| 3954 |
bool src1_uma = false;
|
| 3955 |
+
bool src2_uma = false;
|
| 3956 |
|
| 3957 |
if (ctx->device->uma) {
|
| 3958 |
ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset);
|
|
|
|
| 3961 |
ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset);
|
| 3962 |
src1_uma = d_Y != nullptr;
|
| 3963 |
}
|
| 3964 |
+
if (use_src2) {
|
| 3965 |
+
ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset);
|
| 3966 |
+
src2_uma = d_Z != nullptr;
|
| 3967 |
+
}
|
| 3968 |
}
|
| 3969 |
|
| 3970 |
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
| 3971 |
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
| 3972 |
+
uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
| 3973 |
uint64_t d_sz = ggml_type_size(dst->type) * ne0;
|
| 3974 |
|
| 3975 |
vk_buffer d_D = extra->buffer_gpu.lock();
|
|
|
|
| 3992 |
y_buf_offset = extra_src1->offset;
|
| 3993 |
GGML_ASSERT(d_Y != nullptr);
|
| 3994 |
}
|
| 3995 |
+
if (use_src2 && !src2_uma) {
|
| 3996 |
+
d_Z = extra_src2->buffer_gpu.lock();
|
| 3997 |
+
z_buf_offset = extra_src2->offset;
|
| 3998 |
+
GGML_ASSERT(d_Z != nullptr);
|
| 3999 |
+
}
|
| 4000 |
|
| 4001 |
if (op_supports_incontiguous) {
|
| 4002 |
x_sz = ggml_nbytes(src0);
|
| 4003 |
y_sz = use_src1 ? ggml_nbytes(src1) : 0;
|
| 4004 |
+
z_sz = use_src2 ? ggml_nbytes(src2) : 0;
|
| 4005 |
d_sz = ggml_nbytes(dst);
|
| 4006 |
|
| 4007 |
if (x_buf_offset + x_sz >= d_X->size) {
|
|
|
|
| 4010 |
if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
|
| 4011 |
y_sz = VK_WHOLE_SIZE;
|
| 4012 |
}
|
| 4013 |
+
if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
|
| 4014 |
+
z_sz = VK_WHOLE_SIZE;
|
| 4015 |
+
}
|
| 4016 |
if (d_buf_offset + d_sz >= d_D->size) {
|
| 4017 |
d_sz = VK_WHOLE_SIZE;
|
| 4018 |
}
|
|
|
|
| 4052 |
if (use_src1 && y_sz != VK_WHOLE_SIZE) {
|
| 4053 |
y_sz *= ne12 * ne13;
|
| 4054 |
}
|
| 4055 |
+
if (use_src2 && z_sz != VK_WHOLE_SIZE) {
|
| 4056 |
+
z_sz *= ne22 * ne23;
|
| 4057 |
+
}
|
| 4058 |
if (d_sz != VK_WHOLE_SIZE) {
|
| 4059 |
d_sz *= ne02 * ne03;
|
| 4060 |
}
|
| 4061 |
}
|
| 4062 |
|
| 4063 |
if (op == GGML_OP_SOFT_MAX) {
|
| 4064 |
+
// Empty src1 is possible in soft_max, but the shader needs a buffer
|
| 4065 |
vk_subbuffer subbuf_y;
|
| 4066 |
if (use_src1) {
|
| 4067 |
subbuf_y = { d_Y, y_buf_offset, y_sz };
|
|
|
|
| 4071 |
|
| 4072 |
ggml_vk_sync_buffers(subctx);
|
| 4073 |
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
| 4074 |
+
} else if (op == GGML_OP_ROPE) {
|
| 4075 |
+
const int mode = ((int32_t *) dst->op_params)[2];
|
| 4076 |
+
const bool is_neox = mode & 2;
|
| 4077 |
+
|
| 4078 |
+
if (is_neox) {
|
| 4079 |
+
// Empty src2 is possible in rope, but the shader needs a buffer
|
| 4080 |
+
vk_subbuffer subbuf_z;
|
| 4081 |
+
if (use_src2) {
|
| 4082 |
+
subbuf_z = { d_Z, z_buf_offset, z_sz };
|
| 4083 |
+
} else {
|
| 4084 |
+
subbuf_z = { d_X, 0, d_X->size };
|
| 4085 |
+
}
|
| 4086 |
+
|
| 4087 |
+
ggml_vk_sync_buffers(subctx);
|
| 4088 |
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
| 4089 |
+
} else {
|
| 4090 |
+
ggml_vk_sync_buffers(subctx);
|
| 4091 |
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
| 4092 |
+
}
|
| 4093 |
+
} else if (use_src2) {
|
| 4094 |
+
ggml_vk_sync_buffers(subctx);
|
| 4095 |
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
| 4096 |
} else if (use_src1) {
|
| 4097 |
ggml_vk_sync_buffers(subctx);
|
| 4098 |
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
|
|
|
| 4103 |
} else {
|
| 4104 |
GGML_ASSERT(op != GGML_OP_SOFT_MAX);
|
| 4105 |
GGML_ASSERT(op != GGML_OP_ARGSORT);
|
| 4106 |
+
GGML_ASSERT(!use_src2);
|
| 4107 |
|
| 4108 |
ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
|
| 4109 |
|
|
|
|
| 4145 |
}
|
| 4146 |
|
| 4147 |
static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 4148 |
+
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
|
| 4149 |
}
|
| 4150 |
|
| 4151 |
static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
|
|
| 4153 |
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
| 4154 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4155 |
|
| 4156 |
+
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
|
| 4157 |
(uint32_t)ggml_nelements(src0),
|
| 4158 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4159 |
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
|
|
| 4168 |
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
| 4169 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4170 |
|
| 4171 |
+
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ADD, {
|
| 4172 |
(uint32_t)ggml_nelements(src0),
|
| 4173 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4174 |
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
|
|
| 4183 |
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
| 4184 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4185 |
|
| 4186 |
+
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_MUL, {
|
| 4187 |
(uint32_t)ggml_nelements(src0),
|
| 4188 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4189 |
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
|
|
| 4198 |
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
| 4199 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4200 |
|
| 4201 |
+
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, {
|
| 4202 |
(uint32_t)ggml_nelements(src0),
|
| 4203 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4204 |
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
|
|
| 4211 |
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
| 4212 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4213 |
|
| 4214 |
+
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, {
|
| 4215 |
(uint32_t)ggml_nelements(src0),
|
| 4216 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4217 |
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
|
|
| 4225 |
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
| 4226 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4227 |
|
| 4228 |
+
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, {
|
| 4229 |
(uint32_t)ggml_nelements(src0),
|
| 4230 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4231 |
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
|
|
| 4240 |
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 4241 |
const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
|
| 4242 |
|
| 4243 |
+
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
|
| 4244 |
(uint32_t)ggml_nelements(src0),
|
| 4245 |
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 4246 |
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
|
|
| 4252 |
static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
| 4253 |
float * op_params = (float *)dst->op_params;
|
| 4254 |
|
| 4255 |
+
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
| 4256 |
}
|
| 4257 |
|
| 4258 |
static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
| 4259 |
float * op_params = (float *)dst->op_params;
|
| 4260 |
+
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
| 4261 |
}
|
| 4262 |
|
| 4263 |
static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
| 4264 |
+
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
|
| 4265 |
}
|
| 4266 |
|
| 4267 |
static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
| 4268 |
int32_t * op_params = (int32_t *)dst->op_params;
|
| 4269 |
+
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
|
| 4270 |
}
|
| 4271 |
|
| 4272 |
static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
|
|
| 4285 |
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
| 4286 |
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
| 4287 |
|
| 4288 |
+
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
|
| 4289 |
ncols,
|
| 4290 |
src1 != nullptr ? nrows_y : (uint32_t)0,
|
| 4291 |
scale, max_bias,
|
|
|
|
| 4294 |
});
|
| 4295 |
}
|
| 4296 |
|
| 4297 |
+
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
|
| 4298 |
const int n_dims = ((int32_t *) dst->op_params)[1];
|
| 4299 |
const int mode = ((int32_t *) dst->op_params)[2];
|
| 4300 |
// const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
|
|
| 4317 |
if (is_neox) {
|
| 4318 |
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
| 4319 |
const float inv_ndims = -1.0f / n_dims;
|
| 4320 |
+
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
| 4321 |
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
|
| 4322 |
+
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
|
| 4323 |
+
src2 != nullptr,
|
| 4324 |
});
|
| 4325 |
} else {
|
| 4326 |
+
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
| 4327 |
(uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
|
| 4328 |
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
|
| 4329 |
});
|
|
|
|
| 4346 |
|
| 4347 |
std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
|
| 4348 |
|
| 4349 |
+
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
|
| 4350 |
ncols,
|
| 4351 |
ncols_pad,
|
| 4352 |
op_params[0],
|
|
|
|
| 5462 |
|
| 5463 |
const ggml_tensor * src0 = node->src[0];
|
| 5464 |
const ggml_tensor * src1 = node->src[1];
|
| 5465 |
+
const ggml_tensor * src2 = node->src[2];
|
| 5466 |
|
| 5467 |
switch (node->op) {
|
| 5468 |
case GGML_OP_UNARY:
|
|
|
|
| 5579 |
|
| 5580 |
break;
|
| 5581 |
case GGML_OP_ROPE:
|
| 5582 |
+
ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);
|
| 5583 |
|
| 5584 |
break;
|
| 5585 |
case GGML_OP_ARGSORT:
|
|
|
|
| 6555 |
for (int j = 0; j < level; j++) {
|
| 6556 |
std::cerr << " ";
|
| 6557 |
}
|
| 6558 |
+
std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;
|
| 6559 |
|
| 6560 |
done.push_back(tensor);
|
| 6561 |
|
|
|
|
| 6605 |
static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
|
| 6606 |
void * tensor_data = tensor->data;
|
| 6607 |
|
| 6608 |
+
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
| 6609 |
const size_t tensor_size = ggml_nbytes(tensor);
|
| 6610 |
tensor_data = malloc(tensor_size);
|
| 6611 |
|
|
|
|
| 6616 |
}
|
| 6617 |
|
| 6618 |
std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
|
| 6619 |
+
std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
|
| 6620 |
if (tensor->src[0] != nullptr) {
|
| 6621 |
+
std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
|
| 6622 |
}
|
| 6623 |
if (tensor->src[1] != nullptr) {
|
| 6624 |
+
std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
|
| 6625 |
}
|
| 6626 |
std::cerr << std::endl << "Result:" << std::endl;
|
| 6627 |
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
|
|
|
|
| 6632 |
std::vector<const ggml_tensor *> done;
|
| 6633 |
ggml_vk_print_graph_origin(tensor, done);
|
| 6634 |
|
| 6635 |
+
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
| 6636 |
free(tensor_data);
|
| 6637 |
}
|
| 6638 |
}
|
| 6639 |
|
|
...

 void * comp_result;
 size_t comp_size;
 size_t comp_nb[GGML_MAX_DIMS];
...

     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
+    ggml_tensor * src2 = tensor->src[2];

     struct ggml_init_params iparams = {
         /*.mem_size =*/ 1024*1024*1024,
...

     src0_buffer = malloc(src0_size);
     src0_clone->data = src0_buffer;
+    if (ggml_backend_buffer_is_host(src0->buffer)) {
         memcpy(src0_clone->data, src0->data, src0_size);
         memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
+    } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
         uint64_t offset = extra->offset;
...
         if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
             ggml_vk_print_tensor(ctx, src0, "src0");
         }
     }
     if (src1 != nullptr) {
         src1_clone = ggml_dup_tensor(ggml_ctx, src1);
...

         src1_buffer = malloc(src1_size);
         src1_clone->data = src1_buffer;
+        if (ggml_backend_buffer_is_host(src1->buffer)) {
             memcpy(src1_clone->data, src1->data, src1_size);
             memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
+        } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
             uint64_t offset = extra->offset;
...
         if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
             ggml_vk_print_tensor(ctx, src1, "src1");
             std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
+            std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
             if (src1->src[0] != nullptr) {
+                std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
             }
             if (src1->src[1] != nullptr) {
+                std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
             }
             std::cerr << std::endl << "Result:" << std::endl;
             ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
...
             std::vector<const ggml_tensor *> done;
             ggml_vk_print_graph_origin(src1_clone, done);
         }
+    }
+    if (src2 != nullptr) {
+        src2_clone = ggml_dup_tensor(ggml_ctx, src2);
+
+        src2_size = ggml_nbytes(src2);
+
+        src2_buffer = malloc(src2_size);
+        src2_clone->data = src2_buffer;
+        if (ggml_backend_buffer_is_host(src2->buffer)) {
+            memcpy(src2_clone->data, src2->data, src2_size);
+            memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
+        } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
+            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
+            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
+            uint64_t offset = extra->offset;
+            if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
+                for (int i3 = 0; i3 < src2->ne[3]; i3++) {
+                    for (int i2 = 0; i2 < src2->ne[2]; i2++) {
+                        const int idx = i3*src2->ne[2] + i2;
+                        ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
+                    }
+                }
+
+                src2_clone->nb[0] = src2->nb[0];
+                src2_clone->nb[1] = src2->nb[1];
+                for (int i = 2; i < GGML_MAX_DIMS; i++) {
+                    src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
+                }
+            } else {
+                if (offset + src2_size >= buffer_gpu->size) {
+                    src2_size = buffer_gpu->size - offset;
+                }
+                ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
+                memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
+            }
+        } else {
+            GGML_ASSERT(false);
+        }

+        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
+            ggml_vk_print_tensor(ctx, src2, "src2");
+            std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
+            std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
+            if (src2->src[0] != nullptr) {
+                std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
+            }
+            if (src2->src[1] != nullptr) {
+                std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
+            }
+            std::cerr << std::endl << "Result:" << std::endl;
+            ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
+            std::cerr << std::endl;
+            std::cerr << std::endl << "Result:" << std::endl;
+            ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0);
+            std::cerr << std::endl;
+            std::vector<const ggml_tensor *> done;
+            ggml_vk_print_graph_origin(src2_clone, done);
+        }
     }
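
When the clone is repacked from a row-contiguous source, the block above copies nb[0] and nb[1] from the original tensor and recomputes the higher strides densely. A small standalone illustration of that stride fix-up (sizes below are made up for the example):

#include <cstddef>
#include <cstdio>

int main() {
    constexpr int ndims = 4;                                // GGML_MAX_DIMS
    const size_t ne[ndims]     = { 64, 32, 4, 1 };          // element counts per dimension (illustrative)
    const size_t src_nb[ndims] = { 4, 256, 16384, 65536 };  // source strides, padded between planes (illustrative)

    // Keep the source row layout, then pack dimensions 2.. densely, as the src2_clone fix-up does.
    size_t nb[ndims];
    nb[0] = src_nb[0];
    nb[1] = src_nb[1];
    for (int i = 2; i < ndims; ++i) {
        nb[i] = nb[i - 1] * ne[i - 1];
    }

    for (int i = 0; i < ndims; ++i) {
        printf("nb[%d] = %zu bytes\n", i, nb[i]);           // nb[2]/nb[3] shrink once the padding is dropped
    }
    return 0;
}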
     if (tensor->op == GGML_OP_MUL_MAT) {
...
         float attn_factor = ((float *) tensor->op_params)[8];
         float beta_fast = ((float *) tensor->op_params)[9];
         float beta_slow = ((float *) tensor->op_params)[10];
+        tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
     } else if (tensor->op == GGML_OP_UNARY) {
         switch (ggml_get_unary_op(tensor)) {
         case GGML_UNARY_OP_SILU:
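
ggml_rope_ext is the graph-level entry point whose third tensor argument carries the optional frequency factors; the cross-check above reuses it to rebuild the RoPE node on the CPU. A minimal, self-contained sketch of that call shape follows; the shapes and hyperparameters are illustrative assumptions, not values taken from this commit:

#include "ggml.h"
#include <cstdio>

int main() {
    // Small scratch context; sizes are illustrative.
    struct ggml_init_params ip = { /*.mem_size =*/ 64*1024*1024, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    const int n_dims   = 128; // rotary dimensions
    const int n_tokens = 8;
    const int mode     = 2;   // NeoX-style RoPE, the path the updated shader targets

    ggml_tensor * x   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_dims, 1, n_tokens); // [head_dim, n_head, n_tokens]
    ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);            // one position per token
    ggml_tensor * ff  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2);            // frequency factors; may be nullptr

    ggml_set_f32(x, 1.0f);
    ggml_set_f32(ff, 1.0f); // 1.0 means no per-frequency rescaling
    for (int i = 0; i < n_tokens; ++i) {
        ((int32_t *) pos->data)[i] = i;
    }

    // Same argument order as the ggml_rope_ext call in the cross-check above.
    ggml_tensor * out = ggml_rope_ext(ctx, x, pos, ff,
        n_dims, mode, /*n_ctx*/ 0, /*n_orig_ctx*/ 4096,
        /*freq_base*/ 10000.0f, /*freq_scale*/ 1.0f,
        /*ext_factor*/ 0.0f, /*attn_factor*/ 1.0f,
        /*beta_fast*/ 32.0f, /*beta_slow*/ 1.0f);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads*/ 1);

    printf("out[0] = %f\n", (double) ggml_get_f32_1d(out, 0));
    ggml_free(ctx);
    return 0;
}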
...

     ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);

     if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
         ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
     }
...

     void * tensor_data = tensor->data;

+    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
         size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
...

     if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
         std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
+        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
         if (src0 != nullptr) {
+            std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
         }
         if (src1 != nullptr) {
+            std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
         }
         std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
         std::cerr << std::endl << "Result:" << std::endl;
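
The check above flags a value immediately if its NaN-ness or Inf-ness disagrees with the CPU reference, before any magnitude-based error is considered. The same predicate as a standalone sketch (helper name is hypothetical):

#include <cmath>
#include <cstdio>

static bool special_value_mismatch(float correct, float result) {
    return (std::isnan(correct) != std::isnan(result)) ||
           (std::isinf(correct) != std::isinf(result));
}

int main() {
    printf("%d\n", special_value_mismatch(1.0f, NAN));          // 1: NaN on one side only
    printf("%d\n", special_value_mismatch(INFINITY, INFINITY)); // 0: both infinite
    printf("%d\n", special_value_mismatch(0.5f, 0.25f));        // 0: plain numeric difference, left to avg_err
    return 0;
}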
...

     if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
         std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
+        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
         if (src0 != nullptr) {
+            std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
         }
         if (src1 != nullptr) {
+            std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
         }
         std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
         std::cerr << std::endl << "Result:" << std::endl;
...

     if (avg_err > 0.05 || std::isnan(avg_err)) {
         std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
+        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
         if (src0 != nullptr) {
+            std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
         }
         if (src1 != nullptr) {
+            std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
         }
         std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
         std::cerr << std::endl << "Result:" << std::endl;
...
         ggml_vk_print_graph_origin(tensor, done);
         GGML_ASSERT(false);
     } else {
+        std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
     }

     free(comp_result);
     comp_result = nullptr;
     comp_size = 0;

+    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
         free(tensor_data);
     }
 }