talk-llama : sync llama.cpp
Files changed:
- examples/talk-llama/llama.cpp   +801 -255
- examples/talk-llama/llama.h       +9   -0
- examples/talk-llama/unicode.h     +2   -1

examples/talk-llama/llama.cpp (CHANGED)
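Judging from the hunks below, this sync pulls in two new architectures (LLM_ARCH_QWEN2 and LLM_ARCH_CODESHELL) with their tensor-name tables, hparams cases and tensor loading, new e_model sizes (MODEL_0_5B, MODEL_4B), pre-allocated input tensors owned by llama_context, a new llm_build_kv helper, per-buffer mlock tracking, an explicit GGML-type to ftype mapping in the loader, a Q3_K_XS file-type name, optional attention bias tensors for Stable LM 2 1.6B, and reworked k-quant selection. The most mechanical change is the attention refactor: every architecture's graph builder used to call llm_build_kv_store and llm_build_kqv separately and now calls llm_build_kv, which also force-expands Q/K/V so the scheduler does not reorder them. A minimal before/after sketch, assembled from the calls visible in the hunks (surrounding builder code elided, not the verbatim upstream source):

    // before: store K/V into the cache, then compute attention, in two calls
    llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
    cur = llm_build_kqv(ctx0, model, hparams, kv_self,
            model.layers[il].wo, model.layers[il].bo,
            Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);

    // after: one helper expands Q/K/V into the graph, stores K/V and computes the output
    cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
            model.layers[il].wo, model.layers[il].bo,
            Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
            -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);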
[Note: the old-file pane of this split diff did not survive extraction. Added lines render as blank rows and most removed lines are cut off mid-line. The hunk headers and the removed lines that are still legible are kept below; the new-file pane, as far as this capture extends, follows after them.]

Hunks in examples/talk-llama/llama.cpp:

@@ -192,8 +192,10 @@ enum llm_arch {
@@ -211,8 +213,10 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
@@ -566,6 +570,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
@@ -600,6 +621,26 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
@@ -1284,8 +1325,10 @@ static llama_state g_state;
@@ -1599,7 +1642,7 @@ struct llama_model {
@@ -1626,6 +1669,9 @@ struct llama_context {
@@ -1672,8 +1718,14 @@ struct llama_context {
@@ -2257,18 +2309,18 @@ struct llama_model_loader {
@@ -2618,6 +2670,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
@@ -2833,6 +2886,7 @@ static void llm_load_hparams(
@@ -2847,6 +2901,17 @@ static void llm_load_hparams(
@@ -2877,6 +2942,14 @@ static void llm_load_hparams(
@@ -3438,7 +3511,12 @@ static bool llm_load_tensors(
@@ -3632,6 +3710,11 @@ static bool llm_load_tensors(
@@ -3669,6 +3752,41 @@ static bool llm_load_tensors(
@@ -3779,6 +3897,42 @@ static bool llm_load_tensors(
@@ -3815,8 +3969,10 @@ static bool llm_load_tensors(
@@ -3942,22 +4098,24 @@ static struct ggml_tensor * llm_build_inp_embd(
@@ -3971,6 +4129,7 @@ static void llm_build_k_shift(
@@ -3987,9 +4146,6 @@ static void llm_build_k_shift(
@@ -4177,6 +4333,7 @@ static struct ggml_tensor * llm_build_kqv(
@@ -4255,6 +4412,8 @@ static struct ggml_tensor * llm_build_kqv(
@@ -4267,8 +4426,47 @@ static struct ggml_tensor * llm_build_kqv(
@@ -4315,6 +4513,7 @@ struct llm_build_context {
@@ -4375,20 +4574,20 @@ struct llm_build_context {
@@ -4424,12 +4623,6 @@ struct llm_build_context {
@@ -4444,11 +4637,9 @@ struct llm_build_context {
@@ -4567,20 +4758,20 @@ struct llm_build_context {
@@ -4625,14 +4816,13 @@ struct llm_build_context {
@@ -4689,20 +4879,20 @@ struct llm_build_context {
@@ -4754,11 +4944,9 @@ struct llm_build_context {
@@ -4813,15 +5001,15 @@ struct llm_build_context {
@@ -4855,11 +5043,9 @@ struct llm_build_context {
@@ -4912,19 +5098,19 @@ struct llm_build_context {
@@ -5062,12 +5248,9 @@ struct llm_build_context {
@@ -5122,11 +5305,11 @@ struct llm_build_context {
@@ -5154,11 +5337,9 @@ struct llm_build_context {
@@ -5214,11 +5395,11 @@ struct llm_build_context {
@@ -5252,11 +5433,9 @@ struct llm_build_context {
@@ -5309,11 +5488,11 @@ struct llm_build_context {
@@ -5347,11 +5526,9 @@ struct llm_build_context {
@@ -5407,20 +5584,20 @@ struct llm_build_context {
@@ -5438,12 +5615,24 @@ struct llm_build_context {
@@ -5459,11 +5648,9 @@ struct llm_build_context {
@@ -5520,20 +5707,20 @@ struct llm_build_context {
@@ -5576,11 +5763,9 @@ struct llm_build_context {
@@ -5625,6 +5810,126 @@ struct llm_build_context {
@@ -5637,20 +5942,20 @@ struct llm_build_context {
@@ -5706,11 +6011,9 @@ struct llm_build_context {
@@ -5761,20 +6064,20 @@ struct llm_build_context {
@@ -5811,11 +6114,9 @@ struct llm_build_context {
@@ -5870,15 +6171,15 @@ struct llm_build_context {
@@ -5912,11 +6213,118 @@ struct llm_build_context {
@@ -5968,15 +6376,7 @@ static struct ggml_cgraph * llama_build_graph(
@@ -5984,118 +6384,78 @@ static struct ggml_cgraph * llama_build_graph(
@@ -6140,6 +6500,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -6152,6 +6516,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -7588,10 +7956,57 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, ...
@@ -7783,6 +8198,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, ...
@@ -8371,9 +8853,13 @@ struct quantize_state_internal {
@@ -8457,6 +8943,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, ...
@@ -8476,8 +8979,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, ...
@@ -8514,27 +9017,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, ...
@@ -8564,11 +9054,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, ...
@@ -8586,6 +9077,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, ...
@@ -8640,8 +9149,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, ...
@@ -8709,12 +9219,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, ...
@@ -9522,6 +10038,35 @@ struct llama_context * llama_new_context_with_model(
@@ -9548,9 +10093,6 @@ struct llama_context * llama_new_context_with_model(
@@ -9559,6 +10101,10 @@ struct llama_context * llama_new_context_with_model(

Removed lines that survived extraction ("…" marks text cut off by the extraction; most other removed lines in these hunks were lost entirely):

struct llama_model (@@ -1599,7):
-    llama_mlock …

llama_model_loader (@@ -2257,18): the twelve "case GGML_TYPE_F32:" through "case GGML_TYPE_Q6_K:" lines of the type_max switch are replaced; their old right-hand sides are truncated in this capture.

llm_build_k_shift (@@ -3987,9):
-    struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
-    cb(K_shift, "K_shift", -1);

llm_build_context, first builder (@@ -4424,12):
-            // these nodes are added to the graph together so that they are not reordered
-            // by doing so, the number of splits in the graph is reduced
-            ggml_build_forward_expand(gf, Qcur);
-            ggml_build_forward_expand(gf, Kcur);
-            ggml_build_forward_expand(gf, Vcur);

Every builder in llm_build_context (@@ -4444 through @@ -5912) loses the same pair of calls, replaced by the new llm_build_kv helper; representative removed lines:
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);

llama_build_graph (@@ -5968,15):
-    // keep track of the input that has already been allocated
-    bool alloc_inp_tokens   = false;
-    bool alloc_inp_embd     = false;
-    bool alloc_inp_pos      = false;
-    bool alloc_inp_KQ_mask  = false;
-    bool alloc_inp_K_shift  = false;
-    // TODO: improve handling of input and output tensors, then replace this with ggml_set_name

llama_build_graph (@@ -5984,118): the callback body that lazily allocated and filled the input tensors is removed; legible fragments include:
-        if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
-            ggml_tallocr_alloc(lctx.alloc, cur);
-            if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
-                const int64_t n_tokens = cur->ne[0];
-                ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
-            alloc_inp_tokens = true;
-                lctx.buf_copy.resize(ggml_nbytes(cur));
-                data = (float *) lctx.buf_copy.data();
-                    if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
-                        f = -INFINITY;
-                    } else {
-                        f = 0;
-                    }
-                    data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
-            if (data != cur->data) {
-                ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
-            alloc_inp_KQ_mask = true;
-                data = (int32_t *) cur->data;
-            alloc_inp_K_shift = true;
(plus analogous branches for inp_embd, inp_pos and K_shift)

llama_sample_top_k (@@ -7588,10):
-    if (k …
-        std::sort(candidates->data, candidates->data + candidates->size, comp);
-    } else {

quantize_state_internal (@@ -8371,9): two counter declarations next to n_attention_wv / i_attention_wv are replaced (names truncated to "int …").

get_k_quant_type (@@ -8514,27): the Mixtral-specific ffn_down layer lookup is removed:
-        int i_layer, n_layer;
-        if (n_expert == 1) {
-            i_layer = qs.i_feed_forward_w2;
-            n_layer = qs.n_feed_forward_w2;
-        } else {
-            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
-            // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
-            // for getting the current layer as I initially thought, and we need to resort to parsing the
-            // tensor name.
-            n_layer = qs.n_feed_forward_w2 / n_expert;
-            if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
-                throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
-            }
-            if (i_layer < 0 || i_layer >= n_layer) {
-                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
-            }
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {

llama_model_quantize_internal (@@ -8709,12):
-            ++qs.…
-    if (qs.n_attention_wv != qs.…
-        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, …
-            __func__, qs.n_attention_wv, qs.…

llama_new_context_with_model (@@ -9548,9):
-        // note: the number of splits during measure is higher than during inference due to the kv shift
-        int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-        LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
|
|
|
| 192 |
LLM_ARCH_BLOOM,
|
| 193 |
LLM_ARCH_STABLELM,
|
| 194 |
LLM_ARCH_QWEN,
|
| 195 |
+
LLM_ARCH_QWEN2,
|
| 196 |
LLM_ARCH_PHI2,
|
| 197 |
LLM_ARCH_PLAMO,
|
| 198 |
+
LLM_ARCH_CODESHELL,
|
| 199 |
LLM_ARCH_UNKNOWN,
|
| 200 |
};
|
| 201 |
|
|
|
|
| 213 |
{ LLM_ARCH_BLOOM, "bloom" },
|
| 214 |
{ LLM_ARCH_STABLELM, "stablelm" },
|
| 215 |
{ LLM_ARCH_QWEN, "qwen" },
|
| 216 |
+
{ LLM_ARCH_QWEN2, "qwen2" },
|
| 217 |
{ LLM_ARCH_PHI2, "phi2" },
|
| 218 |
{ LLM_ARCH_PLAMO, "plamo" },
|
| 219 |
+
{ LLM_ARCH_CODESHELL, "codeshell" },
|
| 220 |
};
|
| 221 |
|
| 222 |
enum llm_kv {
|
|
|
|
| 570 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 571 |
},
|
| 572 |
},
|
| 573 |
+
{
|
| 574 |
+
LLM_ARCH_QWEN2,
|
| 575 |
+
{
|
| 576 |
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 577 |
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
| 578 |
+
{ LLM_TENSOR_OUTPUT, "output" },
|
| 579 |
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
| 580 |
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 581 |
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
| 582 |
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
| 583 |
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
| 584 |
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
| 585 |
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
| 586 |
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 587 |
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 588 |
+
},
|
| 589 |
+
},
|
| 590 |
{
|
| 591 |
LLM_ARCH_PHI2,
|
| 592 |
{
|
|
|
|
| 621 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 622 |
},
|
| 623 |
},
|
| 624 |
+
{
|
| 625 |
+
LLM_ARCH_CODESHELL,
|
| 626 |
+
{
|
| 627 |
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 628 |
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
| 629 |
+
{ LLM_TENSOR_OUTPUT, "output" },
|
| 630 |
+
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
| 631 |
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
| 632 |
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 633 |
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
| 634 |
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
| 635 |
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
| 636 |
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
| 637 |
+
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
| 638 |
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
| 639 |
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
| 640 |
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 641 |
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 642 |
+
},
|
| 643 |
+
},
|
| 644 |
|
| 645 |
{
|
| 646 |
LLM_ARCH_UNKNOWN,
|
|
|
|
| 1325 |
// available llama models
|
| 1326 |
enum e_model {
|
| 1327 |
MODEL_UNKNOWN,
|
| 1328 |
+
MODEL_0_5B,
|
| 1329 |
MODEL_1B,
|
| 1330 |
MODEL_3B,
|
| 1331 |
+
MODEL_4B,
|
| 1332 |
MODEL_7B,
|
| 1333 |
MODEL_8B,
|
| 1334 |
MODEL_13B,
|
|
|
|
| 1642 |
std::unique_ptr<llama_mmap> mapping;
|
| 1643 |
|
| 1644 |
// objects representing data potentially being locked in memory
|
| 1645 |
+
std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
|
| 1646 |
llama_mlock mlock_mmap;
|
| 1647 |
|
| 1648 |
// for quantize-stats only
|
|
|
|
| 1669 |
for (ggml_backend_t backend : backends) {
|
| 1670 |
ggml_backend_free(backend);
|
| 1671 |
}
|
| 1672 |
+
|
| 1673 |
+
ggml_backend_buffer_free(buf_input);
|
| 1674 |
+
ggml_free(ctx_input);
|
| 1675 |
}
|
| 1676 |
|
| 1677 |
llama_cparams cparams;
|
|
|
|
| 1718 |
// allocator for the input tensors
|
| 1719 |
ggml_tallocr * alloc = nullptr;
|
| 1720 |
|
| 1721 |
+
// input tensors
|
| 1722 |
+
ggml_backend_buffer_t buf_input = nullptr;
|
| 1723 |
+
ggml_context * ctx_input = nullptr;
|
| 1724 |
+
struct ggml_tensor * inp_tokens; // I32 [n_batch]
|
| 1725 |
+
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
|
| 1726 |
+
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
| 1727 |
+
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
| 1728 |
+
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
| 1729 |
|
| 1730 |
#ifdef GGML_USE_MPI
|
| 1731 |
ggml_mpi_context * ctx_mpi = NULL;
|
|
|
|
| 2309 |
}
|
| 2310 |
|
| 2311 |
switch (type_max) {
|
| 2312 |
+
case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
|
| 2313 |
+
case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
|
| 2314 |
+
case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
|
| 2315 |
+
case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
|
| 2316 |
+
case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
|
| 2317 |
+
case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
|
| 2318 |
+
case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
|
| 2319 |
+
case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
|
| 2320 |
+
case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
|
| 2321 |
+
case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
|
| 2322 |
+
case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
|
| 2323 |
+
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
|
| 2324 |
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
|
| 2325 |
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
|
| 2326 |
default:
|
|
|
|
| 2670 |
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
|
| 2671 |
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
|
| 2672 |
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
| 2673 |
+
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
|
| 2674 |
|
| 2675 |
default: return "unknown, may not work";
|
| 2676 |
}
|
|
|
|
| 2886 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
| 2887 |
|
| 2888 |
switch (hparams.n_layer) {
|
| 2889 |
+
case 24: model.type = e_model::MODEL_1B; break;
|
| 2890 |
case 32: model.type = e_model::MODEL_3B; break;
|
| 2891 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 2892 |
}
|
|
|
|
| 2901 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 2902 |
}
|
| 2903 |
} break;
|
| 2904 |
+
case LLM_ARCH_QWEN2:
|
| 2905 |
+
{
|
| 2906 |
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
| 2907 |
+
switch (hparams.n_layer) {
|
| 2908 |
+
case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
|
| 2909 |
+
case 32: model.type = e_model::MODEL_7B; break;
|
| 2910 |
+
case 40: model.type = hparams.n_head == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
|
| 2911 |
+
case 80: model.type = e_model::MODEL_70B; break;
|
| 2912 |
+
default: model.type = e_model::MODEL_UNKNOWN;
|
| 2913 |
+
}
|
| 2914 |
+
} break;
|
| 2915 |
case LLM_ARCH_PHI2:
|
| 2916 |
{
|
| 2917 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
|
|
| 2942 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 2943 |
}
|
| 2944 |
} break;
|
| 2945 |
+
case LLM_ARCH_CODESHELL:
|
| 2946 |
+
{
|
| 2947 |
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
| 2948 |
+
switch (hparams.n_layer) {
|
| 2949 |
+
case 42: model.type = e_model::MODEL_SMALL; break;
|
| 2950 |
+
default: model.type = e_model::MODEL_UNKNOWN;
|
| 2951 |
+
}
|
| 2952 |
+
} break;
|
| 2953 |
|
| 2954 |
default: (void)0;
|
| 2955 |
}
|
|
|
|
| 3511 |
{
|
| 3512 |
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
| 3513 |
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
| 3514 |
+
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
|
| 3515 |
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
| 3516 |
+
} else {
|
| 3517 |
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
| 3518 |
+
ml.n_created--; // artificial tensor
|
| 3519 |
+
}
|
| 3520 |
}
|
| 3521 |
|
| 3522 |
for (int i = 0; i < n_layer; ++i) {
|
|
|
|
| 3710 |
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
| 3711 |
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
| 3712 |
|
| 3713 |
+
// optional bias tensors, present in Stable LM 2 1.6B
|
| 3714 |
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
|
| 3715 |
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
|
| 3716 |
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
|
| 3717 |
+
|
| 3718 |
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
| 3719 |
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
| 3720 |
|
|
|
|
| 3752 |
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
|
| 3753 |
}
|
| 3754 |
} break;
|
| 3755 |
+
case LLM_ARCH_QWEN2:
|
| 3756 |
+
{
|
| 3757 |
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
| 3758 |
+
|
| 3759 |
+
// output
|
| 3760 |
+
{
|
| 3761 |
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
| 3762 |
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
| 3763 |
+
}
|
| 3764 |
+
|
| 3765 |
+
for (int i = 0; i < n_layer; ++i) {
|
| 3766 |
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
| 3767 |
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
| 3768 |
+
|
| 3769 |
+
auto & layer = model.layers[i];
|
| 3770 |
+
|
| 3771 |
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
| 3772 |
+
|
| 3773 |
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
| 3774 |
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
| 3775 |
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
| 3776 |
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
| 3777 |
+
|
| 3778 |
+
// optional bias tensors
|
| 3779 |
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
| 3780 |
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
| 3781 |
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
| 3782 |
+
|
| 3783 |
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
| 3784 |
+
|
| 3785 |
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
| 3786 |
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
| 3787 |
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
| 3788 |
+
}
|
| 3789 |
+
} break;
|
| 3790 |
case LLM_ARCH_PHI2:
|
| 3791 |
{
|
| 3792 |
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
|
|
| 3897 |
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
| 3898 |
}
|
| 3899 |
} break;
|
| 3900 |
+
case LLM_ARCH_CODESHELL:
|
| 3901 |
+
{
|
| 3902 |
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
| 3903 |
+
|
| 3904 |
+
// output
|
| 3905 |
+
{
|
| 3906 |
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
| 3907 |
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
| 3908 |
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
| 3909 |
+
}
|
| 3910 |
+
|
| 3911 |
+
for (int i = 0; i < n_layer; ++i) {
|
| 3912 |
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
| 3913 |
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
| 3914 |
+
|
| 3915 |
+
auto & layer = model.layers[i];
|
| 3916 |
+
|
| 3917 |
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
| 3918 |
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
| 3919 |
+
|
| 3920 |
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
| 3921 |
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
|
| 3922 |
+
|
| 3923 |
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
| 3924 |
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
| 3925 |
+
|
| 3926 |
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
| 3927 |
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
| 3928 |
+
|
| 3929 |
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
| 3930 |
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
| 3931 |
+
|
| 3932 |
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
| 3933 |
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
| 3934 |
+
}
|
| 3935 |
+
} break;
|
| 3936 |
default:
|
| 3937 |
throw std::runtime_error("unknown architecture");
|
| 3938 |
}
|
|
|
|
| 3969 |
else {
|
| 3970 |
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
| 3971 |
if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
|
| 3972 |
+
model.mlock_bufs.emplace_back(new llama_mlock);
|
| 3973 |
+
auto & mlock_buf = model.mlock_bufs.back();
|
| 3974 |
+
mlock_buf->init (ggml_backend_buffer_get_base(buf));
|
| 3975 |
+
mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
|
| 3976 |
}
|
| 3977 |
}
|
| 3978 |
if (buf == nullptr) {
|
|
|
|
| 4098 |
const llama_hparams & hparams,
|
| 4099 |
const llama_batch & batch,
|
| 4100 |
struct ggml_tensor * tok_embd,
|
| 4101 |
+
struct ggml_tensor * inp_tokens,
|
| 4102 |
+
struct ggml_tensor * inp_embd,
|
| 4103 |
const llm_build_cb & cb) {
|
| 4104 |
const int64_t n_embd = hparams.n_embd;
|
| 4105 |
|
| 4106 |
struct ggml_tensor * inpL;
|
| 4107 |
|
| 4108 |
if (batch.token) {
|
| 4109 |
+
struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0);
|
| 4110 |
cb(inp_tokens, "inp_tokens", -1);
|
| 4111 |
|
| 4112 |
+
inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v);
|
| 4113 |
} else {
|
| 4114 |
#ifdef GGML_USE_MPI
|
| 4115 |
GGML_ASSERT(false && "not implemented");
|
| 4116 |
#endif
|
| 4117 |
|
| 4118 |
+
inpL = ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0);
|
| 4119 |
}
|
| 4120 |
|
| 4121 |
return inpL;
|
|
|
|
| 4129 |
const llama_cparams & cparams,
|
| 4130 |
const llama_kv_cache & kv,
|
| 4131 |
struct ggml_cgraph * graph,
|
| 4132 |
+
struct ggml_tensor * K_shift,
|
| 4133 |
llm_rope_type type,
|
| 4134 |
int64_t n_ctx,
|
| 4135 |
float freq_base,
|
|
|
|
| 4146 |
const float beta_fast = cparams.yarn_beta_fast;
|
| 4147 |
const float beta_slow = cparams.yarn_beta_slow;
|
| 4148 |
|
|
|
|
|
|
|
|
|
|
| 4149 |
int rope_type = 0;
|
| 4150 |
|
| 4151 |
switch (type) {
|
|
|
|
         const llama_model & model,
       const llama_hparams & hparams,
      const llama_kv_cache & kv,
+        struct ggml_cgraph * graph,
        struct ggml_tensor * wo,
        struct ggml_tensor * wo_b,
        struct ggml_tensor * q_cur,
...
    struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
    cb(cur, "kqv_merged_cont", il);

+   ggml_build_forward_expand(graph, cur);
+
    cur = ggml_mul_mat(ctx, wo, cur);
    if (wo_b) {
        cb(cur, "kqv_wo", il);
...
    return cur;
}

+static struct ggml_tensor * llm_build_kv(
+        struct ggml_context * ctx,
+          const llama_model & model,
+        const llama_hparams & hparams,
+       const llama_kv_cache & kv,
+         struct ggml_cgraph * graph,
+         struct ggml_tensor * wo,
+         struct ggml_tensor * wo_b,
+         struct ggml_tensor * k_cur,
+         struct ggml_tensor * v_cur,
+         struct ggml_tensor * q_cur,
+         struct ggml_tensor * kq_mask,
+                    int64_t   n_ctx,
+                    int32_t   n_tokens,
+                    int32_t   kv_head,
+                    int32_t   n_kv,
+                      float   max_alibi_bias,
+                      float   kq_scale,
+         const llm_build_cb & cb,
+                        int   il) {
+
+   // these nodes are added to the graph together so that they are not reordered
+   // by doing so, the number of splits in the graph is reduced
+   ggml_build_forward_expand(graph, q_cur);
+   ggml_build_forward_expand(graph, k_cur);
+   ggml_build_forward_expand(graph, v_cur);
+
+   llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
+
+   struct ggml_tensor * cur;
+   cur = llm_build_kqv(ctx, model, hparams, kv, graph,
+           wo, wo_b,
+           q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
+   cb(cur, "kqv_out", il);
+
+   return cur;
+}
+
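Each per-architecture graph builder further down in this diff now routes its attention through this helper once per layer. For reference, the call shape used by those builders (taken from the hunks below; the bias tensors and the ALiBi value vary per architecture, and -1.0f disables ALiBi) is:

    cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
            model.layers[il].wo, model.layers[il].bo,
            Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);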
 struct llm_build_context {
    const llama_model   & model;
+   const llama_context & lctx;
    const llama_hparams & hparams;
    const llama_cparams & cparams;
    const llama_batch   & batch;
...
        const llm_build_cb & cb,
                      bool   worst_case) :
        model   (lctx.model),
+       lctx    (lctx),
        hparams (model.hparams),
        cparams (lctx.cparams),
        batch   (batch),
...
| 4574 |
struct ggml_tensor * cur;
|
| 4575 |
struct ggml_tensor * inpL;
|
| 4576 |
|
| 4577 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 4578 |
cb(inpL, "inp_embd", -1);
|
| 4579 |
|
| 4580 |
// inp_pos - contains the positions
|
| 4581 |
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
| 4582 |
cb(inp_pos, "inp_pos", -1);
|
| 4583 |
|
| 4584 |
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 4585 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 4586 |
cb(KQ_mask, "KQ_mask", -1);
|
| 4587 |
|
| 4588 |
// shift the entire K-cache if needed
|
| 4589 |
if (do_rope_shift) {
|
| 4590 |
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
| 4591 |
}
|
| 4592 |
|
| 4593 |
for (int il = 0; il < n_layer; ++il) {
|
|
|
|
| 4623 |
cb(Vcur, "Vcur", il);
|
| 4624 |
}
|
| 4625 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4626 |
Qcur = ggml_rope_custom(
|
| 4627 |
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
| 4628 |
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
|
|
|
| 4637 |
);
|
| 4638 |
cb(Kcur, "Kcur", il);
|
| 4639 |
|
| 4640 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
|
|
|
|
|
| 4641 |
model.layers[il].wo, model.layers[il].bo,
|
| 4642 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 4643 |
cb(cur, "kqv_out", il);
|
| 4644 |
}
|
| 4645 |
|
|
|
|
| 4758 |
struct ggml_tensor * cur;
|
| 4759 |
struct ggml_tensor * inpL;
|
| 4760 |
|
| 4761 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 4762 |
cb(inpL, "inp_embd", -1);
|
| 4763 |
|
| 4764 |
// inp_pos - contains the positions
|
| 4765 |
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
| 4766 |
cb(inp_pos, "inp_pos", -1);
|
| 4767 |
|
| 4768 |
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 4769 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 4770 |
cb(KQ_mask, "KQ_mask", -1);
|
| 4771 |
|
| 4772 |
// shift the entire K-cache if needed
|
| 4773 |
if (do_rope_shift) {
|
| 4774 |
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
| 4775 |
}
|
| 4776 |
|
| 4777 |
for (int il = 0; il < n_layer; ++il) {
|
|
|
|
| 4816 |
cb(Qcur, "Qcur", il);
|
| 4817 |
cb(Kcur, "Kcur", il);
|
| 4818 |
|
|
|
|
| 4819 |
|
| 4820 |
// apply ALiBi for 13B model
|
| 4821 |
const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
|
| 4822 |
|
| 4823 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 4824 |
model.layers[il].wo, NULL,
|
| 4825 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 4826 |
cb(cur, "kqv_out", il);
|
| 4827 |
}
|
| 4828 |
|
|
|
|
| 4879 |
struct ggml_tensor * cur;
|
| 4880 |
struct ggml_tensor * inpL;
|
| 4881 |
|
| 4882 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 4883 |
cb(inpL, "inp_embd", -1);
|
| 4884 |
|
| 4885 |
// inp_pos - contains the positions
|
| 4886 |
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
| 4887 |
cb(inp_pos, "inp_pos", -1);
|
| 4888 |
|
| 4889 |
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 4890 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 4891 |
cb(KQ_mask, "KQ_mask", -1);
|
| 4892 |
|
| 4893 |
// shift the entire K-cache if needed
|
| 4894 |
if (do_rope_shift) {
|
| 4895 |
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
| 4896 |
}
|
| 4897 |
|
| 4898 |
for (int il = 0; il < n_layer; ++il) {
|
|
|
|
| 4944 |
);
|
| 4945 |
cb(Kcur, "Kcur", il);
|
| 4946 |
|
| 4947 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
|
|
|
|
|
| 4948 |
model.layers[il].wo, NULL,
|
| 4949 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 4950 |
cb(cur, "kqv_out", il);
|
| 4951 |
}
|
| 4952 |
|
|
|
|
| 5001 |
struct ggml_tensor * pos;
|
| 5002 |
struct ggml_tensor * inpL;
|
| 5003 |
|
| 5004 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 5005 |
cb(inpL, "inp_embd", -1);
|
| 5006 |
|
| 5007 |
// inp_pos - contains the positions
|
| 5008 |
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
| 5009 |
cb(inp_pos, "inp_pos", -1);
|
| 5010 |
|
| 5011 |
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 5012 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 5013 |
cb(KQ_mask, "KQ_mask", -1);
|
| 5014 |
|
| 5015 |
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
|
|
|
| 5043 |
|
| 5044 |
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
| 5045 |
|
| 5046 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
|
|
|
|
|
| 5047 |
model.layers[il].wo, model.layers[il].bo,
|
| 5048 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 5049 |
cb(cur, "kqv_out", il);
|
| 5050 |
}
|
| 5051 |
|
|
|
|
| 5098 |
struct ggml_tensor * cur;
|
| 5099 |
struct ggml_tensor * inpL;
|
| 5100 |
|
| 5101 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 5102 |
cb(inpL, "inp_embd", -1);
|
| 5103 |
|
| 5104 |
// inp_pos - contains the positions
|
| 5105 |
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
| 5106 |
cb(inp_pos, "inp_pos", -1);
|
| 5107 |
|
| 5108 |
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 5109 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 5110 |
cb(KQ_mask, "KQ_mask", -1);
|
| 5111 |
|
| 5112 |
if (do_rope_shift) {
|
| 5113 |
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
| 5114 |
}
|
| 5115 |
|
| 5116 |
for (int il = 0; il < n_layer; ++il) {
|
|
|
|
| 5248 |
);
|
| 5249 |
cb(Vcur, "Vcur", il);
|
| 5250 |
|
| 5251 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
|
|
|
|
|
|
|
|
| 5252 |
model.layers[il].wo, model.layers[il].bo,
|
| 5253 |
+
Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 5254 |
cb(cur, "kqv_out", il);
|
| 5255 |
}
|
| 5256 |
|
|
|
|
| 5305 |
struct ggml_tensor * cur;
|
| 5306 |
struct ggml_tensor * inpL;
|
| 5307 |
|
| 5308 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 5309 |
cb(inpL, "inp_embd", -1);
|
| 5310 |
|
| 5311 |
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 5312 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 5313 |
cb(KQ_mask, "KQ_mask", -1);
|
| 5314 |
|
| 5315 |
for (int il = 0; il < n_layer; ++il) {
|
|
|
|
| 5337 |
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
| 5338 |
cb(Qcur, "Qcur", il);
|
| 5339 |
|
| 5340 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
|
|
|
|
|
| 5341 |
model.layers[il].wo, NULL,
|
| 5342 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 5343 |
cb(cur, "kqv_out", il);
|
| 5344 |
}
|
| 5345 |
|
|
|
|
| 5395 |
struct ggml_tensor * cur;
|
| 5396 |
struct ggml_tensor * inpL;
|
| 5397 |
|
| 5398 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 5399 |
cb(inpL, "inp_embd", -1);
|
| 5400 |
|
| 5401 |
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 5402 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 5403 |
cb(KQ_mask, "KQ_mask", -1);
|
| 5404 |
|
| 5405 |
inpL = llm_build_norm(ctx0, inpL, hparams,
|
|
|
|
| 5433 |
|
| 5434 |
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
| 5435 |
|
| 5436 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
|
|
|
|
|
| 5437 |
model.layers[il].wo, model.layers[il].bo,
|
| 5438 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 5439 |
cb(cur, "kqv_out", il);
|
| 5440 |
}
|
| 5441 |
|
|
|
|
| 5488 |
struct ggml_tensor * cur;
|
| 5489 |
struct ggml_tensor * inpL;
|
| 5490 |
|
| 5491 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 5492 |
cb(inpL, "inp_embd", -1);
|
| 5493 |
|
| 5494 |
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 5495 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 5496 |
cb(KQ_mask, "KQ_mask", -1);
|
| 5497 |
|
| 5498 |
for (int il = 0; il < n_layer; ++il) {
|
|
|
|
| 5526 |
|
| 5527 |
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
| 5528 |
|
| 5529 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
|
|
|
|
|
| 5530 |
model.layers[il].wo, NULL,
|
| 5531 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 5532 |
cb(cur, "kqv_out", il);
|
| 5533 |
}
|
| 5534 |
|
|
|
|
| 5584 |
struct ggml_tensor * cur;
|
| 5585 |
struct ggml_tensor * inpL;
|
| 5586 |
|
| 5587 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 5588 |
cb(inpL, "inp_embd", -1);
|
| 5589 |
|
| 5590 |
// inp_pos - contains the positions
|
| 5591 |
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
| 5592 |
cb(inp_pos, "inp_pos", -1);
|
| 5593 |
|
| 5594 |
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 5595 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 5596 |
cb(KQ_mask, "KQ_mask", -1);
|
| 5597 |
|
| 5598 |
// shift the entire K-cache if needed
|
| 5599 |
if (do_rope_shift) {
|
| 5600 |
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
| 5601 |
}
|
| 5602 |
|
| 5603 |
for (int il = 0; il < n_layer; ++il) {
|
|
|
|
| 5615 |
// compute Q and K and RoPE them
|
| 5616 |
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
| 5617 |
cb(Qcur, "Qcur", il);
|
| 5618 |
+
if (model.layers[il].bq) {
|
| 5619 |
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
| 5620 |
+
cb(Qcur, "Qcur", il);
|
| 5621 |
+
}
|
| 5622 |
|
| 5623 |
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
| 5624 |
cb(Kcur, "Kcur", il);
|
| 5625 |
+
if (model.layers[il].bk) {
|
| 5626 |
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
| 5627 |
+
cb(Kcur, "Kcur", il);
|
| 5628 |
+
}
|
| 5629 |
|
| 5630 |
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
| 5631 |
cb(Vcur, "Vcur", il);
|
| 5632 |
+
if (model.layers[il].bv) {
|
| 5633 |
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
| 5634 |
+
cb(Vcur, "Vcur", il);
|
| 5635 |
+
}
|
| 5636 |
|
| 5637 |
Qcur = ggml_rope_custom(
|
| 5638 |
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
|
|
| 5648 |
);
|
| 5649 |
cb(Kcur, "Kcur", il);
|
| 5650 |
|
| 5651 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
|
|
|
|
|
| 5652 |
model.layers[il].wo, NULL,
|
| 5653 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 5654 |
cb(cur, "kqv_out", il);
|
| 5655 |
}
|
| 5656 |
|
|
|
|
| 5707 |
struct ggml_tensor * cur;
|
| 5708 |
struct ggml_tensor * inpL;
|
| 5709 |
|
| 5710 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 5711 |
cb(inpL, "inp_embd", -1);
|
| 5712 |
|
| 5713 |
// inp_pos - contains the positions
|
| 5714 |
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
| 5715 |
cb(inp_pos, "inp_pos", -1);
|
| 5716 |
|
| 5717 |
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 5718 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 5719 |
cb(KQ_mask, "KQ_mask", -1);
|
| 5720 |
|
| 5721 |
// shift the entire K-cache if needed
|
| 5722 |
if (do_rope_shift) {
|
| 5723 |
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
| 5724 |
}
|
| 5725 |
|
| 5726 |
for (int il = 0; il < n_layer; ++il) {
|
|
|
|
| 5763 |
);
|
| 5764 |
cb(Kcur, "Kcur", il);
|
| 5765 |
|
| 5766 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
|
|
|
|
|
| 5767 |
model.layers[il].wo, NULL,
|
| 5768 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 5769 |
cb(cur, "kqv_out", il);
|
| 5770 |
}
|
| 5771 |
|
|
|
|
| 5810 |
|
| 5811 |
return gf;
|
| 5812 |
}
|
| 5813 |
+
|
| 5814 |
+
struct ggml_cgraph * build_qwen2() {
|
| 5815 |
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
| 5816 |
+
|
| 5817 |
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
| 5818 |
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
| 5819 |
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
| 5820 |
+
|
| 5821 |
+
struct ggml_tensor * cur;
|
| 5822 |
+
struct ggml_tensor * inpL;
|
| 5823 |
+
|
| 5824 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 5825 |
+
cb(inpL, "inp_embd", -1);
|
| 5826 |
+
|
| 5827 |
+
// inp_pos - contains the positions
|
| 5828 |
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
| 5829 |
+
cb(inp_pos, "inp_pos", -1);
|
| 5830 |
+
|
| 5831 |
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 5832 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 5833 |
+
cb(KQ_mask, "KQ_mask", -1);
|
| 5834 |
+
|
| 5835 |
+
// shift the entire K-cache if needed
|
| 5836 |
+
if (do_rope_shift) {
|
| 5837 |
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
| 5838 |
+
}
|
| 5839 |
+
|
| 5840 |
+
for (int il = 0; il < n_layer; ++il) {
|
| 5841 |
+
struct ggml_tensor * inpSA = inpL;
|
| 5842 |
+
|
| 5843 |
+
// norm
|
| 5844 |
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
| 5845 |
+
model.layers[il].attn_norm, NULL,
|
| 5846 |
+
LLM_NORM_RMS, cb, il);
|
| 5847 |
+
cb(cur, "attn_norm", il);
|
| 5848 |
+
|
| 5849 |
+
// self-attention
|
| 5850 |
+
{
|
| 5851 |
+
// compute Q and K and RoPE them
|
| 5852 |
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
| 5853 |
+
cb(Qcur, "Qcur", il);
|
| 5854 |
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
| 5855 |
+
cb(Qcur, "Qcur", il);
|
| 5856 |
+
|
| 5857 |
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
| 5858 |
+
cb(Kcur, "Kcur", il);
|
| 5859 |
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
| 5860 |
+
cb(Kcur, "Kcur", il);
|
| 5861 |
+
|
| 5862 |
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
| 5863 |
+
cb(Vcur, "Vcur", il);
|
| 5864 |
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
| 5865 |
+
cb(Vcur, "Vcur", il);
|
| 5866 |
+
|
| 5867 |
+
// these nodes are added to the graph together so that they are not reordered
|
| 5868 |
+
// by doing so, the number of splits in the graph is reduced
|
| 5869 |
+
ggml_build_forward_expand(gf, Qcur);
|
| 5870 |
+
ggml_build_forward_expand(gf, Kcur);
|
| 5871 |
+
ggml_build_forward_expand(gf, Vcur);
|
| 5872 |
+
|
| 5873 |
+
Qcur = ggml_rope_custom(
|
| 5874 |
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
| 5875 |
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
| 5876 |
+
ext_factor, attn_factor, beta_fast, beta_slow
|
| 5877 |
+
);
|
| 5878 |
+
cb(Qcur, "Qcur", il);
|
| 5879 |
+
|
| 5880 |
+
Kcur = ggml_rope_custom(
|
| 5881 |
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
| 5882 |
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
| 5883 |
+
ext_factor, attn_factor, beta_fast, beta_slow
|
| 5884 |
+
);
|
| 5885 |
+
cb(Kcur, "Kcur", il);
|
| 5886 |
+
|
| 5887 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 5888 |
+
model.layers[il].wo, model.layers[il].bo,
|
| 5889 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 5890 |
+
cb(cur, "kqv_out", il);
|
| 5891 |
+
}
|
| 5892 |
+
|
| 5893 |
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
| 5894 |
+
cb(ffn_inp, "ffn_inp", il);
|
| 5895 |
+
|
| 5896 |
+
// feed-forward network
|
| 5897 |
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
| 5898 |
+
model.layers[il].ffn_norm, NULL,
|
| 5899 |
+
LLM_NORM_RMS, cb, il);
|
| 5900 |
+
cb(cur, "ffn_norm", il);
|
| 5901 |
+
|
| 5902 |
+
cur = llm_build_ffn(ctx0, cur,
|
| 5903 |
+
model.layers[il].ffn_up, NULL,
|
| 5904 |
+
model.layers[il].ffn_gate, NULL,
|
| 5905 |
+
model.layers[il].ffn_down, NULL,
|
| 5906 |
+
NULL,
|
| 5907 |
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
| 5908 |
+
cb(cur, "ffn_out", il);
|
| 5909 |
+
|
| 5910 |
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
| 5911 |
+
cb(cur, "l_out", il);
|
| 5912 |
+
|
| 5913 |
+
// input for next layer
|
| 5914 |
+
inpL = cur;
|
| 5915 |
+
}
|
| 5916 |
+
|
| 5917 |
+
cur = inpL;
|
| 5918 |
+
|
| 5919 |
+
cur = llm_build_norm(ctx0, cur, hparams,
|
| 5920 |
+
model.output_norm, NULL,
|
| 5921 |
+
LLM_NORM_RMS, cb, -1);
|
| 5922 |
+
cb(cur, "result_norm", -1);
|
| 5923 |
+
|
| 5924 |
+
// lm_head
|
| 5925 |
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
| 5926 |
+
cb(cur, "result_output", -1);
|
| 5927 |
+
|
| 5928 |
+
ggml_build_forward_expand(gf, cur);
|
| 5929 |
+
|
| 5930 |
+
return gf;
|
| 5931 |
+
}
|
| 5932 |
+
|
| 5933 |
struct ggml_cgraph * build_phi2() {
|
| 5934 |
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
| 5935 |
|
|
|
|
| 5942 |
struct ggml_tensor * ffn_output;
|
| 5943 |
struct ggml_tensor * inpL;
|
| 5944 |
|
| 5945 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 5946 |
cb(inpL, "inp_embd", -1);
|
| 5947 |
|
| 5948 |
// inp_pos - contains the positions
|
| 5949 |
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
| 5950 |
cb(inp_pos, "inp_pos", -1);
|
| 5951 |
|
| 5952 |
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 5953 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 5954 |
cb(KQ_mask, "KQ_mask", -1);
|
| 5955 |
|
| 5956 |
// shift the entire K-cache if needed
|
| 5957 |
if (do_rope_shift) {
|
| 5958 |
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
| 5959 |
}
|
| 5960 |
|
| 5961 |
for (int il = 0; il < n_layer; ++il) {
|
|
|
|
| 6011 |
);
|
| 6012 |
cb(Kcur, "Kcur", il);
|
| 6013 |
|
| 6014 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
|
|
|
|
|
| 6015 |
model.layers[il].wo, model.layers[il].bo,
|
| 6016 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
|
| 6017 |
cb(cur, "kqv_out", il);
|
| 6018 |
}
|
| 6019 |
|
|
|
|
| 6064 |
struct ggml_tensor * cur;
|
| 6065 |
struct ggml_tensor * inpL;
|
| 6066 |
|
| 6067 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 6068 |
cb(inpL, "inp_embd", -1);
|
| 6069 |
|
| 6070 |
// inp_pos - contains the positions
|
| 6071 |
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
| 6072 |
cb(inp_pos, "inp_pos", -1);
|
| 6073 |
|
| 6074 |
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 6075 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 6076 |
cb(KQ_mask, "KQ_mask", -1);
|
| 6077 |
|
| 6078 |
// shift the entire K-cache if needed
|
| 6079 |
if (do_rope_shift) {
|
| 6080 |
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
| 6081 |
}
|
| 6082 |
|
| 6083 |
for (int il = 0; il < n_layer; ++il) {
|
|
|
|
| 6114 |
ext_factor, attn_factor, beta_fast, beta_slow);
|
| 6115 |
cb(Kcur, "Kcur", il);
|
| 6116 |
|
| 6117 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
|
|
|
|
|
| 6118 |
model.layers[il].wo, NULL,
|
| 6119 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 6120 |
cb(cur, "kqv_out", il);
|
| 6121 |
}
|
| 6122 |
struct ggml_tensor * sa_out = cur;
|
|
|
|
| 6171 |
struct ggml_tensor * pos;
|
| 6172 |
struct ggml_tensor * inpL;
|
| 6173 |
|
| 6174 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 6175 |
cb(inpL, "inp_embd", -1);
|
| 6176 |
|
| 6177 |
// inp_pos - contains the positions
|
| 6178 |
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
| 6179 |
cb(inp_pos, "inp_pos", -1);
|
| 6180 |
|
| 6181 |
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 6182 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 6183 |
cb(KQ_mask, "KQ_mask", -1);
|
| 6184 |
|
| 6185 |
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
|
|
|
| 6213 |
|
| 6214 |
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
| 6215 |
|
| 6216 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 6217 |
+
model.layers[il].wo, model.layers[il].bo,
|
| 6218 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 6219 |
+
cb(cur, "kqv_out", il);
|
| 6220 |
+
}
|
| 6221 |
+
|
| 6222 |
+
// add the input
|
| 6223 |
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
| 6224 |
+
cb(ffn_inp, "ffn_inp", il);
|
| 6225 |
+
|
| 6226 |
+
// FF
|
| 6227 |
+
{
|
| 6228 |
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
| 6229 |
+
model.layers[il].ffn_norm,
|
| 6230 |
+
model.layers[il].ffn_norm_b,
|
| 6231 |
+
LLM_NORM, cb, il);
|
| 6232 |
+
cb(cur, "ffn_norm", il);
|
| 6233 |
+
|
| 6234 |
+
cur = llm_build_ffn(ctx0, cur,
|
| 6235 |
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
| 6236 |
+
NULL, NULL,
|
| 6237 |
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
| 6238 |
+
NULL,
|
| 6239 |
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
| 6240 |
+
cb(cur, "ffn_out", il);
|
| 6241 |
+
}
|
| 6242 |
+
|
| 6243 |
+
inpL = ggml_add(ctx0, cur, ffn_inp);
|
| 6244 |
+
cb(inpL, "l_out", il);
|
| 6245 |
+
}
|
| 6246 |
+
|
| 6247 |
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
| 6248 |
+
model.output_norm,
|
| 6249 |
+
model.output_norm_b,
|
| 6250 |
+
LLM_NORM, cb, -1);
|
| 6251 |
+
cb(cur, "result_norm", -1);
|
| 6252 |
+
|
| 6253 |
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
| 6254 |
+
cb(cur, "result_output", -1);
|
| 6255 |
+
|
| 6256 |
+
ggml_build_forward_expand(gf, cur);
|
| 6257 |
+
|
| 6258 |
+
return gf;
|
| 6259 |
+
}
|
| 6260 |
+
|
| 6261 |
+
struct ggml_cgraph * build_codeshell() {
|
| 6262 |
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
| 6263 |
+
|
| 6264 |
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
| 6265 |
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
| 6266 |
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
| 6267 |
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
| 6268 |
+
|
| 6269 |
+
struct ggml_tensor * cur;
|
| 6270 |
+
struct ggml_tensor * inpL;
|
| 6271 |
+
|
| 6272 |
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
| 6273 |
+
cb(inpL, "inp_embd", -1);
|
| 6274 |
+
|
| 6275 |
+
// inp_pos - contains the positions
|
| 6276 |
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
| 6277 |
+
cb(inp_pos, "inp_pos", -1);
|
| 6278 |
+
|
| 6279 |
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 6280 |
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 6281 |
+
cb(KQ_mask, "KQ_mask", -1);
|
| 6282 |
+
|
| 6283 |
+
// shift the entire K-cache if needed
|
| 6284 |
+
if (do_rope_shift) {
|
| 6285 |
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
| 6286 |
+
}
|
| 6287 |
+
|
| 6288 |
+
for (int il = 0; il < n_layer; ++il) {
|
| 6289 |
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
| 6290 |
+
model.layers[il].attn_norm,
|
| 6291 |
+
model.layers[il].attn_norm_b,
|
| 6292 |
+
LLM_NORM, cb, il);
|
| 6293 |
+
cb(cur, "attn_norm", il);
|
| 6294 |
+
|
| 6295 |
+
// self-attention
|
| 6296 |
+
{
|
| 6297 |
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
| 6298 |
+
cb(cur, "wqkv", il);
|
| 6299 |
+
|
| 6300 |
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
| 6301 |
+
cb(cur, "bqkv", il);
|
| 6302 |
+
|
| 6303 |
+
struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
| 6304 |
+
struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
| 6305 |
+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
| 6306 |
+
|
| 6307 |
+
cb(tmpq, "tmpq", il);
|
| 6308 |
+
cb(tmpk, "tmpk", il);
|
| 6309 |
+
cb(Vcur, "Vcur", il);
|
| 6310 |
+
|
| 6311 |
+
struct ggml_tensor * Qcur = ggml_rope_custom(
|
| 6312 |
+
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
|
| 6313 |
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
| 6314 |
+
ext_factor, attn_factor, beta_fast, beta_slow
|
| 6315 |
+
);
|
| 6316 |
+
cb(Qcur, "Qcur", il);
|
| 6317 |
+
|
| 6318 |
+
struct ggml_tensor * Kcur = ggml_rope_custom(
|
| 6319 |
+
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
| 6320 |
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
| 6321 |
+
ext_factor, attn_factor, beta_fast, beta_slow
|
| 6322 |
+
);
|
| 6323 |
+
cb(Kcur, "Kcur", il);
|
| 6324 |
|
| 6325 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 6326 |
model.layers[il].wo, model.layers[il].bo,
|
| 6327 |
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 6328 |
cb(cur, "kqv_out", il);
|
| 6329 |
}
|
| 6330 |
|
|
|
|
    // check if we should build the worst-case graph (for memory measurement)
    const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);

...
    // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
    llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
        if (il >= 0) {
            ggml_format_name(cur, "%s-%d", name, il);
...
            ggml_set_name(cur, name);
        }

+       if (!lctx.cparams.offload_kqv) {
+           if (strcmp(name, "kqv_merged_cont") == 0) {
+               // all nodes between the KV store and the attention output are run on the CPU
+               ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
...
            }
...
        }
+   };
+
+   struct ggml_cgraph * result = NULL;

+   struct llm_build_context llm(lctx, batch, cb, worst_case);
...

+   //
+   // set input data
+   //

+   if (!ggml_tallocr_is_measure(lctx.alloc)) {
+       if (batch.token) {
+           const int64_t n_tokens = batch.n_tokens;

+           ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
        }

+       if (batch.embd) {
+           const int64_t n_embd   = llm.n_embd;
+           const int64_t n_tokens = batch.n_tokens;

+           ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+       }

+       if (batch.pos) {
+           const int64_t n_tokens = batch.n_tokens;
...

+           ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
        }

+       {
+           const int64_t n_kv     = llm.n_kv;
+           const int64_t n_tokens = batch.n_tokens;

+           GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+           float * data = (float *) lctx.inp_KQ_mask->data;
...

+           for (int h = 0; h < 1; ++h) {
+               for (int j = 0; j < n_tokens; ++j) {
+                   const llama_pos    pos    = batch.pos[j];
+                   const llama_seq_id seq_id = batch.seq_id[j][0];
...

+                   for (int i = 0; i < n_kv; ++i) {
+                       float f;
+                       if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
+                           f = -INFINITY;
+                       } else {
+                           f = 0;
...
                        }
+                       data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
                    }
                }
...
            }
...
        }
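The mask filled above is the usual causal, per-sequence attention mask: KV cell i is visible to new token j only if the cell belongs to the same sequence and its position is not in the future. A minimal standalone sketch of the same rule (a hypothetical helper for illustration, simplified to one sequence id per cell, whereas the real cache checks membership in a set of sequence ids):

    #include <cmath>
    #include <vector>

    // Fill a row-major [n_tokens x n_kv] additive attention mask:
    // 0.0f where cell i may be attended to by token j, -INFINITY where it may not.
    static void fill_causal_mask(std::vector<float> & mask,
                                 const std::vector<int> & kv_pos,    // position of each KV cell
                                 const std::vector<int> & kv_seq,    // sequence id of each KV cell
                                 const std::vector<int> & tok_pos,   // position of each new token
                                 const std::vector<int> & tok_seq) { // sequence id of each new token
        const size_t n_kv     = kv_pos.size();
        const size_t n_tokens = tok_pos.size();
        mask.assign(n_tokens*n_kv, -INFINITY);
        for (size_t j = 0; j < n_tokens; ++j) {
            for (size_t i = 0; i < n_kv; ++i) {
                if (kv_seq[i] == tok_seq[j] && kv_pos[i] <= tok_pos[j]) {
                    mask[j*n_kv + i] = 0.0f; // visible: same sequence, not in the future
                }
            }
        }
    }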

+       if (llm.do_rope_shift) {
+           const int64_t n_ctx = llm.n_ctx;
...

+           GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
+           int32_t * data = (int32_t *) lctx.inp_K_shift->data;
...

+           for (int i = 0; i < n_ctx; ++i) {
+               data[i] = lctx.kv_self.cells[i].delta;
...
            }
...
        }
+   }
...

    llm.init();

...
            {
                result = llm.build_qwen();
            } break;
+       case LLM_ARCH_QWEN2:
+           {
+               result = llm.build_qwen2();
+           } break;
        case LLM_ARCH_PHI2:
            {
                result = llm.build_phi2();
...
            {
                result = llm.build_gpt2();
            } break;
+       case LLM_ARCH_CODESHELL:
+           {
+               result = llm.build_codeshell();
+           } break;
        default:
            GGML_ASSERT(false);
    }
...
    auto comp = [](const llama_token_data & a, const llama_token_data & b) {
        return a.logit > b.logit;
    };
+   if (k <= 128) {
...
        std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
+   } else {
+       constexpr int   nbuckets     = 128;
+       constexpr float bucket_low   = -10.0f;
+       constexpr float bucket_high  =  10.0f;
+       constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
+       constexpr float bucker_inter = -bucket_low * bucket_scale;
+
+       std::vector<int> bucket_idx(candidates->size);
+       std::vector<int> histo(nbuckets, 0);
+
+       for (int i = 0; i < (int)candidates->size; ++i) {
+           const float val = candidates->data[i].logit;
+           int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+           ib = std::max(0, std::min(nbuckets-1, ib));
+           bucket_idx[i] = ib;
+           ++histo[ib];
+       }
+       int nhave = 0;
+       int ib = nbuckets - 1;
+       for ( ; ib >= 0; --ib) {
+           nhave += histo[ib];
+           if (nhave >= k) break;
+       }
+       std::vector<llama_token_data> tmp_tokens(nhave);
+       auto ptr = tmp_tokens.data();
+       std::vector<llama_token_data*> bucket_ptrs;
+       bucket_ptrs.reserve(nbuckets - ib);
+       for (int j = nbuckets - 1; j >= ib; --j) {
+           bucket_ptrs.push_back(ptr);
+           ptr += histo[j];
+       }
+       for (int i = 0; i < (int)candidates->size; ++i) {
+           int j = bucket_idx[i];
+           if (j >= ib) {
+               *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
+           }
+       }
+
+       ptr = tmp_tokens.data();
+       int ndone = 0;
+       for (int j = nbuckets-1; j > ib; --j) {
+           std::sort(ptr, ptr + histo[j], comp);
+           ptr += histo[j];
+           ndone += histo[j];
+       }
+       std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
+
+       std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
+
    }
    candidates->sorted = true;
}
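The idea behind the new large-k path is to avoid a partial sort over the whole vocabulary: logits are binned into a coarse histogram over a fixed range, the top buckets that together hold at least k entries are found, and only those entries are sorted. A self-contained sketch of the same technique on plain floats (illustrative only, not the llama.cpp code above):

    #include <algorithm>
    #include <functional>
    #include <vector>

    // Return the k largest values using a coarse histogram so that only the
    // top buckets need sorting.
    static std::vector<float> top_k_bucketed(const std::vector<float> & vals, int k) {
        constexpr int   nbuckets = 128;
        constexpr float lo = -10.0f, hi = 10.0f;
        constexpr float scale = nbuckets/(hi - lo);

        std::vector<int> histo(nbuckets, 0);
        std::vector<int> idx(vals.size());
        for (size_t i = 0; i < vals.size(); ++i) {
            int b = std::max(0, std::min(nbuckets - 1, int((vals[i] - lo)*scale)));
            idx[i] = b;
            ++histo[b];
        }
        // walk buckets from the top until they cover at least k elements
        int need = k, b = nbuckets - 1;
        for (; b >= 0 && need > 0; --b) need -= histo[b];
        const int cutoff = b + 1;

        std::vector<float> cand;
        for (size_t i = 0; i < vals.size(); ++i) {
            if (idx[i] >= cutoff) cand.push_back(vals[i]);
        }
        const size_t nk = std::min<size_t>(k, cand.size());
        std::partial_sort(cand.begin(), cand.begin() + nk, cand.end(), std::greater<float>());
        cand.resize(nk);
        return cand;
    }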
...
    }
}

+void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // no need to do anything if there is only one (or zero) candidates
+    if(candidates_p->size <= 1) {
+        return;
+    }
+
+    // Calculate maximum possible entropy
+    float max_entropy = -logf(1.0f / candidates_p->size);
+
+    llama_sample_softmax(nullptr, candidates_p);
+
+    // Calculate entropy of the softmax probabilities
+    float entropy = 0.0f;
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        float prob = candidates_p->data[i].p;
+        if (prob > 0.0f) { // Ensure no log(0)
+            entropy -= prob * logf(prob);
+        }
+    }
+
+    // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above)
+    float normalized_entropy = entropy / max_entropy;
+
+    // Map the normalized entropy to the desired temperature range using the power function
+    float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
+
+#ifdef DEBUG
+    LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
+    LLAMA_LOG_INFO("Entropy: %f\n", entropy);
+    LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
+    LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
+    LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
+    LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
+#endif
+
+    // Apply the dynamically calculated temperature scaling
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        candidates_p->data[i].logit /= dyn_temp;
+    }
+
+    // Re-compute softmax probabilities after scaling logits with dynamic temperature
+    double max_l_double = candidates_p->data[0].logit;
+    double cum_sum_double = 0.0;
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        double p = exp(candidates_p->data[i].logit - max_l_double);
+        candidates_p->data[i].p = p; // Store the scaled probability
+        cum_sum_double += p;
+    }
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
+    }
+
+#ifdef DEBUG
+    // Print the updated top 25 probabilities after temperature scaling
+    LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
+    for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
+        LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
+    }
+#endif
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
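Restated as math (a paraphrase of the code above, not part of the diff): with softmax probabilities \(p_i\) over \(N\) candidates,

\[
H = -\sum_{i=1}^{N} p_i \ln p_i, \qquad H_{\max} = \ln N, \qquad
T_{\mathrm{dyn}} = T_{\min} + (T_{\max} - T_{\min}) \left( \frac{H}{H_{\max}} \right)^{\gamma}
\]

where \(\gamma\) is exponent_val. Every logit is then divided by \(T_{\mathrm{dyn}}\) and the probabilities are re-normalized, so flat (high-entropy) distributions are sampled near max_temp and sharply peaked ones near min_temp.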
+
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
    const int64_t t_start_sample_us = ggml_time_us();

...
    const llama_model_quantize_params * params;

    int n_attention_wv = 0;
+   int n_ffn_down     = 0;
+   int n_ffn_gate     = 0;
+   int n_ffn_up       = 0;
    int i_attention_wv = 0;
+   int i_ffn_down     = 0;
+   int i_ffn_gate     = 0;
+   int i_ffn_up       = 0;

    int n_k_quantized = 0;
    int n_fallback    = 0;
...
    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    };
+   const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+   auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
+       if (n_expert > 1) {
+           // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
+           // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
+           // for getting the current layer as I initially thought, and we need to resort to parsing the
+           // tensor name.
+           n_layer /= n_expert;
+           if (sscanf(name, "blk.%d.", &i_layer) != 1) {
+               throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
+           }
+           if (i_layer < 0 || i_layer >= n_layer) {
+               throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
+           }
+       }
+       return std::make_pair(i_layer, n_layer);
+   };

    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
        int nx = tensor->ne[0];
| 8979 |
++qs.i_attention_wv;
|
| 8980 |
}
|
| 8981 |
else if (name.find("ffn_down") != std::string::npos) {
|
| 8982 |
+
if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
|
| 8983 |
+
++qs.i_ffn_down;
|
| 8984 |
}
|
| 8985 |
else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
|
| 8986 |
} else if (name.find("attn_v.weight") != std::string::npos) {
|
|
|
|
| 9017 |
// TODO: explore better strategies
|
| 9018 |
new_type = GGML_TYPE_Q8_0;
|
| 9019 |
}
|
| 9020 |
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
|
| 9021 |
+
new_type = GGML_TYPE_Q2_K;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9022 |
}
|
| 9023 |
+
} else if (name.find("ffn_down") != std::string::npos) {
|
| 9024 |
+
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
|
| 9025 |
+
int i_layer = info.first, n_layer = info.second;
|
| 9026 |
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
| 9027 |
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
|
| 9028 |
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
| 9029 |
}
|
| 9030 |
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
|
|
|
| 9054 |
// same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
|
| 9055 |
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
|
| 9056 |
}
|
| 9057 |
+
++qs.i_ffn_down;
|
| 9058 |
} else if (name.find("attn_output.weight") != std::string::npos) {
|
| 9059 |
if (arch != LLM_ARCH_FALCON) {
|
| 9060 |
if (qs.model.hparams.n_expert == 8) {
|
| 9061 |
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
|
| 9062 |
+
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
|
| 9063 |
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
| 9064 |
new_type = GGML_TYPE_Q5_K;
|
| 9065 |
}
|
|
|
|
| 9077 |
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
|
| 9078 |
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
| 9079 |
}
|
| 9080 |
+
else if (name.find("ffn_gate") != std::string::npos) {
|
| 9081 |
+
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
|
| 9082 |
+
int i_layer = info.first, n_layer = info.second;
|
| 9083 |
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
|
| 9084 |
+
new_type = GGML_TYPE_Q2_K;
|
| 9085 |
+
}
|
| 9086 |
+
++qs.i_ffn_gate;
|
| 9087 |
+
}
|
| 9088 |
+
else if (name.find("ffn_up") != std::string::npos) {
|
| 9089 |
+
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
|
| 9090 |
+
int i_layer = info.first, n_layer = info.second;
|
| 9091 |
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
|
| 9092 |
+
new_type = GGML_TYPE_Q2_K;
|
| 9093 |
+
}
|
| 9094 |
+
++qs.i_ffn_up;
|
| 9095 |
+
}
|
| 9096 |
+
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
| 9097 |
+
//}
|
| 9098 |
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
|
| 9099 |
//else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
|
| 9100 |
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
|
|
|
| 9149 |
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
|
| 9150 |
|
| 9151 |
// K-quants
|
| 9152 |
+
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
|
| 9153 |
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
| 9154 |
+
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
|
| 9155 |
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
| 9156 |
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
| 9157 |
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
|
|
|
| 9219 |
++qs.n_attention_wv;
|
| 9220 |
}
|
| 9221 |
else if (name.find("ffn_down") != std::string::npos) {
|
| 9222 |
+
++qs.n_ffn_down;
|
| 9223 |
+
}
|
| 9224 |
+
else if (name.find("ffn_gate") != std::string::npos) {
|
| 9225 |
+
++qs.n_ffn_gate;
|
| 9226 |
+
}
|
| 9227 |
+
else if (name.find("ffn_up") != std::string::npos) {
|
| 9228 |
+
++qs.n_ffn_up;
|
| 9229 |
}
|
| 9230 |
}
|
| 9231 |
+
if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
|
| 9232 |
+
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
|
| 9233 |
+
__func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
|
| 9234 |
}
|
| 9235 |
|
| 9236 |
size_t total_size_org = 0;
|
|
|
|
        ctx->embedding.resize(hparams.n_embd);
    }

+   // graph inputs
+   {
+       ggml_init_params init_params = {
+           /* .mem_size */ ggml_tensor_overhead()*5,
+           /* .mem_buffer */ nullptr,
+           /* .no_alloc */ true,
+       };
+       ctx->ctx_input = ggml_init(init_params);
+
+       ctx->inp_tokens  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
+       ctx->inp_embd    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
+       ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
+       ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
+       ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
+
+       ggml_set_name(ctx->inp_tokens,  "inp_tokens");
+       ggml_set_name(ctx->inp_embd,    "inp_embd");
+       ggml_set_name(ctx->inp_pos,     "inp_pos");
+       ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
+       ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
+
+       ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
+
+       LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
+               ggml_backend_buffer_name(ctx->buf_input),
+               ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
+   }
+
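A note on the `ggml_tensor_overhead()*5` sizing above: with `no_alloc = true` the context only holds metadata for the five input tensors, while the tensor data itself is placed in the host buffer by `ggml_backend_alloc_ctx_tensors_from_buft`. A minimal sketch of the same pattern, with a hypothetical helper name:

    // Sketch only: a ggml context sized to hold metadata for n_tensors tensors,
    // with the actual data to be allocated later in a backend buffer.
    static struct ggml_context * make_input_meta_ctx(size_t n_tensors) {
        struct ggml_init_params params = {
            /* .mem_size   */ ggml_tensor_overhead()*n_tensors,
            /* .mem_buffer */ nullptr,
            /* .no_alloc   */ true,
        };
        return ggml_init(params);
    }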
+   // scheduler and compute buffers
    {
        // buffer types used for the compute buffer of each backend
        std::vector<ggml_backend_buffer_type_t> backend_buft;
...

        // initialize scheduler with the worst-case graph
        ggml_backend_sched_init_measure(ctx->sched, gf);
...
        ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);

        for (ggml_backend_t backend : ctx->backends) {
...
                    ggml_backend_buffer_name(buf),
                    ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
        }
+
+       // note: the number of splits during measure is higher than during inference due to the kv shift
+       int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
+       LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
    }
}

examples/talk-llama/llama.h
CHANGED

@@ -107,6 +107,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ2_XS  = 20, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q2_K_S  = 21, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };

@@ -774,6 +775,14 @@ extern "C" {
                             float   p,
                            size_t   min_keep);

+   /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
+   LLAMA_API void llama_sample_entropy(
+             struct llama_context * ctx,
+           llama_token_data_array * candidates_p,
+                            float   min_temp,
+                            float   max_temp,
+                            float   exponent_val);
+
    LLAMA_API void llama_sample_temp(
              struct llama_context * ctx,
            llama_token_data_array * candidates,
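A sketch of how the new sampler might be called from application code (illustrative only; `logits` and `n_vocab` are assumed to come from the usual llama_get_logits() and llama_n_vocab() calls in the caller's decode loop):

    #include <vector>
    #include "llama.h"

    // Scale logits with the entropy-based dynamic temperature instead of a
    // fixed llama_sample_temp() call.
    static void apply_dynamic_temperature(struct llama_context * ctx, const float * logits, int n_vocab) {
        std::vector<llama_token_data> cur;
        cur.reserve(n_vocab);
        for (llama_token id = 0; id < n_vocab; ++id) {
            cur.push_back(llama_token_data{ id, logits[id], 0.0f });
        }
        llama_token_data_array candidates = { cur.data(), cur.size(), false };

        // temperature is picked in [0.5, 1.5] depending on the entropy of the distribution
        llama_sample_entropy(ctx, &candidates, 0.5f, 1.5f, 1.0f);
    }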
examples/talk-llama/unicode.h
CHANGED

@@ -2,8 +2,9 @@

 #include <cassert>
 #include <stdexcept>
-#include <
+#include <string>
 #include <unordered_map>
+#include <vector>

 static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
     {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},