ggerganov committed
Commit 53d0282 · unverified · Parent: 4f680fe

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -208,6 +208,7 @@ enum llm_arch {
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
+    LLM_ARCH_GEMMA,
     LLM_ARCH_UNKNOWN,
 };

@@ -234,6 +235,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ORION, "orion" },
     { LLM_ARCH_INTERNLM2, "internlm2" },
     { LLM_ARCH_MINICPM, "minicpm" },
+    { LLM_ARCH_GEMMA, "gemma" },
 };

 enum llm_kv {
@@ -760,6 +762,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
         },
     },
+    {
+        LLM_ARCH_GEMMA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2527,6 +2545,7 @@ struct llama_model_loader {
             case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
             case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
             case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+            case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2772,13 +2791,7 @@ struct llama_model_loader {

         std::vector<no_init<uint8_t>> read_buf;

-        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
-            struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
-            if (!cur) {
-                // some tensors may be allocated in a different context
-                continue;
-            }
-
+        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             if (progress_callback) {
                 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                     return false;
@@ -2877,6 +2890,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S  :return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";

         default: return "unknown, may not work";
     }
@@ -3241,6 +3255,16 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GEMMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 18: model.type = e_model::MODEL_2B; break;
+                    case 28: model.type = e_model::MODEL_7B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -3692,7 +3716,7 @@ static bool llm_load_tensors(
     }

     // create one context per buffer type
-    size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
+    size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
         struct ggml_init_params params = {
@@ -3830,6 +3854,7 @@ static bool llm_load_tensors(
                     } else {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                     }
                 }

@@ -4029,6 +4054,8 @@ static bool llm_load_tensors(
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
+
                     model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                 }

@@ -4038,14 +4065,23 @@ static bool llm_load_tensors(

                     auto & layer = model.layers[i];

-                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);

                     layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+
                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);

-                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
-                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
+
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);

                     // AWQ ScaleActivation layer
                     layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
@@ -4358,6 +4394,40 @@ static bool llm_load_tensors(
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                     }
                 } break;
+            case LLM_ARCH_GEMMA:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
+                    ml.n_created--; // artificial tensor
+                    ml.size_data += ggml_nbytes(model.output);
+
+                    const int64_t n_ff = hparams.n_ff;
+                    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+                    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+                    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -6112,7 +6182,7 @@ struct llm_build_context {

             attn_norm = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm,
-                    NULL,
+                    model.layers[il].attn_norm_b,
                     LLM_NORM, cb, il);
             cb(attn_norm, "attn_norm", il);

@@ -6123,6 +6193,11 @@ struct llm_build_context {
                 cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);

+                if (model.layers[il].bqkv){
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }
+
                 if (hparams.f_clamp_kqv > 0.0f) {
                     cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
                     cb(cur, "wqkv_clamped", il);
@@ -6139,7 +6214,7 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
+                        model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
@@ -6152,13 +6227,13 @@ struct llm_build_context {
             {
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
                         model.layers[il].ffn_norm,
-                        NULL,
+                        model.layers[il].ffn_norm_b,
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         model.layers[il].ffn_act,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -6175,7 +6250,7 @@ struct llm_build_context {

         cur = llm_build_norm(ctx0, cur, hparams,
                 model.output_norm,
-                NULL,
+                model.output_norm_b,
                 LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);

@@ -7364,6 +7439,116 @@ struct llm_build_context {

         return gf;
     }
+
+    struct ggml_cgraph * build_gemma() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
+                        n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+                cb(Qcur, "Qcur_scaled", il);
+
+                Kcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
+                        n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };

 static struct ggml_cgraph * llama_build_graph(
@@ -7472,6 +7657,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_minicpm();
             } break;
+        case LLM_ARCH_GEMMA:
+            {
+                result = llm.build_gemma();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -10309,7 +10498,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         return std::make_pair(i_layer, n_layer);
     };

-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
@@ -10354,6 +10546,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
+           new_type = GGML_TYPE_Q5_K;
+       }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
@@ -10406,6 +10601,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
            }
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
+           if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
            new_type = GGML_TYPE_Q5_K;
@@ -10422,7 +10620,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert == 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                     new_type = GGML_TYPE_Q5_K;
                 }
@@ -10489,8 +10687,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ3_XXS:
            case GGML_TYPE_IQ1_S:
-           case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
-           case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
+           case GGML_TYPE_Q2_K:
+           case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
@@ -10531,7 +10729,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
-       case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S ; break;
+       case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
+       case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;

        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
@@ -11995,18 +12194,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
        data_ctx->write(&kv_used, sizeof(kv_used));

        if (kv_buf_size) {
-           const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
            std::vector<uint8_t> tmp_buf;
            for (int il = 0; il < (int) n_layer; ++il) {
-               tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
+               size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+               tmp_buf.resize(k_size);
                ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
                data_ctx->write(tmp_buf.data(), tmp_buf.size());

                // v is not contiguous, copy row by row
-               tmp_buf.resize(elt_size*kv_head);
+               size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+               size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+               tmp_buf.resize(v_row_size);
                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                   ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
+                   ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
                    data_ctx->write(tmp_buf.data(), tmp_buf.size());
                }
            }
@@ -12108,17 +12308,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
        if (kv_buf_size) {
            GGML_ASSERT(kv_self.total_size() == kv_buf_size);

-           const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
            for (int il = 0; il < (int) n_layer; ++il) {
-               size_t k_size = elt_size*n_embd_k_gqa*kv_head;
+               size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
                ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
                inp += k_size;

                // v is not contiguous, copy row by row
-               size_t v_row_size = elt_size*kv_head;
+               size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+               size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                   ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
+                   ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
                    inp += v_row_size;
                }
            }
@@ -12580,6 +12779,37 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
+   } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
+       // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
+       for (auto message : chat) {
+           std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
+           ss << bos << message->role << "\n" << message->content << "</s>\n";
+       }
+       if (add_ass) {
+           ss << "<s>assistant\n";
+       }
+   } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
+       // google/gemma-7b-it
+       std::string system_prompt = "";
+       for (auto message : chat) {
+           std::string role(message->role);
+           if (role == "system") {
+               // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
+               system_prompt = trim(message->content);
+               continue;
+           }
+           // in gemma, "assistant" is "model"
+           role = role == "assistant" ? "model" : message->role;
+           ss << "<start_of_turn>" << role << "\n";
+           if (!system_prompt.empty() && role != "model") {
+               ss << system_prompt << "\n\n";
+               system_prompt = "";
+           }
+           ss << trim(message->content) << "<end_of_turn>\n";
+       }
+       if (add_ass) {
+           ss << "<start_of_turn>model\n";
+       }
    } else {
        // template not supported
        return -1;
@@ -12602,7 +12832,7 @@ LLAMA_API int32_t llama_chat_apply_template(
        // load template from model
        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
        std::string template_key = "tokenizer.chat_template";
-       int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), curr_tmpl.size());
+       int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
        if (res < 0) {
            // worst case: there is no information about template, we will use chatml by default
            curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
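
Editor's note (not part of the diff): the KV-cache save/restore hunks above switch from `ggml_element_size()`-based arithmetic to `ggml_row_size()`, which computes bytes per row from the tensor type. This matters when a cache tensor uses a type whose row size is not simply element size times element count (e.g. block-quantized types). A minimal sketch of the difference, assuming only the public `ggml_row_size(enum ggml_type, int64_t)` helper from ggml.h; the element count is a hypothetical stand-in for `n_embd_k_gqa*kv_head`:

// illustrative only: row-size arithmetic for a contiguous vs block-quantized type
#include <cstdio>
#include "ggml.h"

int main() {
    const int64_t n = 4096; // hypothetical element count, multiple of the Q8_0 block size (32)

    // F16: row size equals element size * n
    printf("f16  row: %zu bytes\n", ggml_row_size(GGML_TYPE_F16,  n));

    // Q8_0: 34 bytes per 32-element block, so the size cannot be derived
    // from a single flat element size as the old code assumed
    printf("q8_0 row: %zu bytes\n", ggml_row_size(GGML_TYPE_Q8_0, n));

    return 0;
}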
examples/talk-llama/llama.h CHANGED
@@ -101,6 +101,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -707,7 +708,7 @@ extern "C" {

     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-    /// NOTE: This function only support some known jinja templates. It is not a jinja parser.
+    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
     /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
     /// @param chat Pointer to a list of multiple llama_chat_message
     /// @param n_msg Number of llama_chat_message in this chat
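
Editor's note (not part of the diff): the new `<start_of_turn>` branch in `llama_chat_apply_template_internal` can be exercised without loading a model, since a custom template string takes precedence over the model's metadata. The sketch below is a hedged usage example, not part of this commit; it assumes the `llama_chat_message` struct and the `llama_chat_apply_template(model, tmpl, chat, n_msg, add_ass, buf, length)` declaration from llama.h at this revision, and that the model pointer may be null when `tmpl` is supplied:

// hypothetical usage sketch for the new Gemma chat-template branch
#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    const llama_chat_message chat[] = {
        { "system",    "You are a terse assistant."  },
        { "user",      "Hello, who are you?"         },
        { "assistant", "I am a demo."                },
        { "user",      "Summarize that in one word." },
    };
    const size_t n_msg = sizeof(chat)/sizeof(chat[0]);

    // any template containing "<start_of_turn>" routes to the gemma handler above
    const char * tmpl = "<start_of_turn>";

    std::vector<char> buf(4096);
    const int32_t n = llama_chat_apply_template(nullptr, tmpl, chat, n_msg, /*add_ass=*/true,
                                                buf.data(), (int32_t) buf.size());
    if (n < 0) {
        fprintf(stderr, "template not supported\n");
        return 1;
    }
    printf("%.*s", n, buf.data());

    // expected shape (the system prompt is merged into the first user turn,
    // and "assistant" is rewritten to "model"):
    //   <start_of_turn>user
    //   You are a terse assistant.
    //
    //   Hello, who are you?<end_of_turn>
    //   <start_of_turn>model
    //   I am a demo.<end_of_turn>
    //   ...
    //   <start_of_turn>model
    return 0;
}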