talk-llama : sync llama.cpp
- examples/talk-llama/llama.cpp +264 -34
- examples/talk-llama/llama.h +2 -1
examples/talk-llama/llama.cpp
CHANGED
@@ -208,6 +208,7 @@ enum llm_arch {
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
+    LLM_ARCH_GEMMA,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -234,6 +235,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ORION,      "orion"     },
     { LLM_ARCH_INTERNLM2,  "internlm2" },
     { LLM_ARCH_MINICPM,    "minicpm"   },
+    { LLM_ARCH_GEMMA,      "gemma"     },
 };
 
 enum llm_kv {
@@ -760,6 +762,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP_EXP,   "blk.%d.ffn_up.%d" },
         },
     },
+    {
+        LLM_ARCH_GEMMA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2527,6 +2545,7 @@ struct llama_model_loader {
                 case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
                 case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
                 case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
+                case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
                 default:
                     {
                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2772,13 +2791,7 @@ struct llama_model_loader {
 
         std::vector<no_init<uint8_t>> read_buf;
 
-        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); ++i) {
-            struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
-            if (!cur) {
-                // some tensors may be allocated in a different context
-                continue;
-            }
-
+        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             if (progress_callback) {
                 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                     return false;
@@ -2877,6 +2890,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S  :return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
 
         default: return "unknown, may not work";
     }
@@ -3241,6 +3255,16 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GEMMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 18: model.type = e_model::MODEL_2B; break;
+                    case 28: model.type = e_model::MODEL_7B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -3692,7 +3716,7 @@ static bool llm_load_tensors(
     }
 
     // create one context per buffer type
-    size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
+    size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
         struct ggml_init_params params = {
@@ -3830,6 +3854,7 @@ static bool llm_load_tensors(
                 } else {
                     model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                     ml.n_created--; // artificial tensor
+                    ml.size_data += ggml_nbytes(model.output);
                 }
             }
 
@@ -4029,6 +4054,8 @@ static bool llm_load_tensors(
                 // output
                 {
                     model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);
+
                     model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
                 }
 
@@ -4038,14 +4065,23 @@ static bool llm_load_tensors(
 
                     auto & layer = model.layers[i];
 
-                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, false);
 
                     layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, false);
+
                     layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, false);
 
-                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
+                    layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, false);
+
+                    layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, false);
+
+                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff}, false);
 
                     // AWQ ScaleActivation layer
                     layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
@@ -4358,6 +4394,40 @@ static bool llm_load_tensors(
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                     }
                 } break;
+            case LLM_ARCH_GEMMA:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
+                    ml.n_created--; // artificial tensor
+                    ml.size_data += ggml_nbytes(model.output);
+
+                    const int64_t n_ff          = hparams.n_ff;
+                    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+                    const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
+                    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -6112,7 +6182,7 @@ struct llm_build_context {
 
         attn_norm = llm_build_norm(ctx0, inpL, hparams,
                 model.layers[il].attn_norm,
-                NULL,
+                model.layers[il].attn_norm_b,
                 LLM_NORM, cb, il);
         cb(attn_norm, "attn_norm", il);
 
@@ -6123,6 +6193,11 @@ struct llm_build_context {
                 cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
 
+                if (model.layers[il].bqkv){
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }
+
                 if (hparams.f_clamp_kqv > 0.0f) {
                     cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
                     cb(cur, "wqkv_clamped", il);
@@ -6139,7 +6214,7 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo,
+                        model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
@@ -6152,13 +6227,13 @@ struct llm_build_context {
             {
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
                         model.layers[il].ffn_norm,
-                        NULL,
+                        model.layers[il].ffn_norm_b,
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                         NULL,                      NULL,
-                        model.layers[il].ffn_down,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         model.layers[il].ffn_act,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -6175,7 +6250,7 @@ struct llm_build_context {
 
         cur = llm_build_norm(ctx0, cur, hparams,
                 model.output_norm,
-                NULL,
+                model.output_norm_b,
                 LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);
 
@@ -7364,6 +7439,116 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_gemma() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
+                        n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+                cb(Qcur, "Qcur_scaled", il);
+
+                Kcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
+                        n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph(
@@ -7472,6 +7657,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_minicpm();
             } break;
+        case LLM_ARCH_GEMMA:
+            {
+                result = llm.build_gemma();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -10309,7 +10498,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         return std::make_pair(i_layer, n_layer);
     };
 
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
@@ -10354,6 +10546,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
+           new_type = GGML_TYPE_Q5_K;
+       }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
@@ -10406,6 +10601,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
        }
    }
+   else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
+       if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
+   }
    else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
        new_type = GGML_TYPE_Q5_K;
@@ -10422,7 +10620,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
    if (arch != LLM_ARCH_FALCON) {
        if (qs.model.hparams.n_expert == 8) {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-               ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+               ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                new_type = GGML_TYPE_Q5_K;
            }
@@ -10489,8 +10687,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
-       case GGML_TYPE_Q2_K:
-       case GGML_TYPE_Q3_K: new_type =
+       case GGML_TYPE_Q2_K:
+       case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
        case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
        case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
        case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
@@ -10531,7 +10729,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  quantized_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
-       case LLAMA_FTYPE_MOSTLY_IQ1_S:   quantized_type = GGML_TYPE_IQ1_S;   break;
+       case LLAMA_FTYPE_MOSTLY_IQ1_S:   quantized_type = GGML_TYPE_IQ1_S;   break;
+       case LLAMA_FTYPE_MOSTLY_IQ4_NL:  quantized_type = GGML_TYPE_IQ4_NL;  break;
 
        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
@@ -11995,18 +12194,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
        data_ctx->write(&kv_used,     sizeof(kv_used));
 
        if (kv_buf_size) {
-           const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
            std::vector<uint8_t> tmp_buf;
            for (int il = 0; il < (int) n_layer; ++il) {
-               tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
+               size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+               tmp_buf.resize(k_size);
                ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
                data_ctx->write(tmp_buf.data(), tmp_buf.size());
 
                // v is not contiguous, copy row by row
-               tmp_buf.resize(elt_size*kv_head);
+               size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+               size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+               tmp_buf.resize(v_row_size);
                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                   ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
+                   ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
                    data_ctx->write(tmp_buf.data(), tmp_buf.size());
                }
            }
@@ -12108,17 +12308,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
        if (kv_buf_size) {
            GGML_ASSERT(kv_self.total_size() == kv_buf_size);
 
-           const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
            for (int il = 0; il < (int) n_layer; ++il) {
-               size_t k_size = elt_size*n_embd_k_gqa*kv_head;
+               size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
                ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
                inp += k_size;
 
                // v is not contiguous, copy row by row
-               size_t v_row_size = elt_size*kv_head;
+               size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+               size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                   ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
+                   ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
                    inp += v_row_size;
                }
            }
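Not part of the diff, just a reading aid for the two KV-cache state hunks above: ggml_row_size(type, n) returns the number of bytes that n elements of a (possibly block-quantized) ggml type occupy, which is why it replaces the old element-size multiplications. A minimal sketch of the three sizes involved, with names mirroring the code above (illustrative only, not code from llama.cpp):

// Illustrative sketch only: byte sizes used when copying one layer of the KV cache.
#include "ggml.h"

struct kv_copy_sizes {
    size_t k_size;       // bytes of K actually in use (K is stored contiguously)
    size_t v_row_size;   // bytes of one V row that are in use (first kv_head cells)
    size_t v_row_stride; // bytes between consecutive V rows (each row spans n_ctx cells)
};

static kv_copy_sizes compute_kv_copy_sizes(
        const struct ggml_tensor * k_l, const struct ggml_tensor * v_l,
        int64_t n_embd_k_gqa, int64_t kv_head, int64_t n_ctx) {
    kv_copy_sizes s;
    s.k_size       = ggml_row_size(k_l->type, n_embd_k_gqa*kv_head);
    s.v_row_size   = ggml_row_size(v_l->type, kv_head);
    s.v_row_stride = ggml_row_size(v_l->type, n_ctx);
    return s;
}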
@@ -12580,6 +12779,37 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
+   } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
+       // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
+       for (auto message : chat) {
+           std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
+           ss << bos << message->role << "\n" << message->content << "</s>\n";
+       }
+       if (add_ass) {
+           ss << "<s>assistant\n";
+       }
+   } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
+       // google/gemma-7b-it
+       std::string system_prompt = "";
+       for (auto message : chat) {
+           std::string role(message->role);
+           if (role == "system") {
+               // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
+               system_prompt = trim(message->content);
+               continue;
+           }
+           // in gemma, "assistant" is "model"
+           role = role == "assistant" ? "model" : message->role;
+           ss << "<start_of_turn>" << role << "\n";
+           if (!system_prompt.empty() && role != "model") {
+               ss << system_prompt << "\n\n";
+               system_prompt = "";
+           }
+           ss << trim(message->content) << "<end_of_turn>\n";
+       }
+       if (add_ass) {
+           ss << "<start_of_turn>model\n";
+       }
    } else {
        // template not supported
        return -1;
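As an illustration (not part of the diff): for a chat with one system message "You are a helpful assistant." and one user message "Hello", with add_ass set, the gemma branch above renders the prompt below — the system text is folded into the first user turn and the assistant role is emitted as "model", as the comments note:

<start_of_turn>user
You are a helpful assistant.

Hello<end_of_turn>
<start_of_turn>model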
@@ -12602,7 +12832,7 @@ LLAMA_API int32_t llama_chat_apply_template(
        // load template from model
        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
        std::string template_key = "tokenizer.chat_template";
-       int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(),
+       int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
        if (res < 0) {
            // worst case: there is no information about template, we will use chatml by default
            curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
examples/talk-llama/llama.h
CHANGED
@@ -101,6 +101,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ1_S   = 24, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_IQ4_NL  = 25, // except 1d tensors
 
        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
@@ -707,7 +708,7 @@ extern "C" {
 
    /// Apply chat template. Inspired by hf apply_chat_template() on python.
    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-   /// NOTE: This function
+   /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
    /// @param chat Pointer to a list of multiple llama_chat_message
    /// @param n_msg Number of llama_chat_message in this chat
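For context, a minimal usage sketch of the function documented above, assuming the llama_chat_message struct from llama.h and the buffer convention described in these comments (the helper name and prompts are illustrative, not from the diff):

#include <string>
#include <vector>
#include "llama.h"

// Render a chat with the model's built-in tokenizer.chat_template (tmpl = nullptr).
static std::string render_chat(const llama_model * model) {
    std::vector<llama_chat_message> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Write a haiku about whisper.cpp." },
    };

    std::vector<char> buf(4096);
    int32_t n = llama_chat_apply_template(model, nullptr, chat.data(), chat.size(),
                                          /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n < 0) {
        return ""; // no template in the model, or template not supported
    }
    if ((size_t) n > buf.size()) {
        buf.resize(n); // the return value is the full length; grow and re-apply
        n = llama_chat_apply_template(model, nullptr, chat.data(), chat.size(),
                                      true, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), n);
}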