Commit 92cfd93 (unverified) · committed by ggerganov · 1 parent: 5a9540e

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -192,8 +192,10 @@ enum llm_arch {
192
  LLM_ARCH_BLOOM,
193
  LLM_ARCH_STABLELM,
194
  LLM_ARCH_QWEN,
 
195
  LLM_ARCH_PHI2,
196
  LLM_ARCH_PLAMO,
 
197
  LLM_ARCH_UNKNOWN,
198
  };
199
 
@@ -211,8 +213,10 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
211
  { LLM_ARCH_BLOOM, "bloom" },
212
  { LLM_ARCH_STABLELM, "stablelm" },
213
  { LLM_ARCH_QWEN, "qwen" },
 
214
  { LLM_ARCH_PHI2, "phi2" },
215
  { LLM_ARCH_PLAMO, "plamo" },
 
216
  };
217
 
218
  enum llm_kv {
@@ -566,6 +570,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
566
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
567
  },
568
  },
569
  {
570
  LLM_ARCH_PHI2,
571
  {
@@ -600,6 +621,26 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
600
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
601
  },
602
  },
603
 
604
  {
605
  LLM_ARCH_UNKNOWN,
@@ -1284,8 +1325,10 @@ static llama_state g_state;
1284
  // available llama models
1285
  enum e_model {
1286
  MODEL_UNKNOWN,
 
1287
  MODEL_1B,
1288
  MODEL_3B,
 
1289
  MODEL_7B,
1290
  MODEL_8B,
1291
  MODEL_13B,
@@ -1599,7 +1642,7 @@ struct llama_model {
1599
  std::unique_ptr<llama_mmap> mapping;
1600
 
1601
  // objects representing data potentially being locked in memory
1602
- llama_mlock mlock_buf;
1603
  llama_mlock mlock_mmap;
1604
 
1605
  // for quantize-stats only
@@ -1626,6 +1669,9 @@ struct llama_context {
1626
  for (ggml_backend_t backend : backends) {
1627
  ggml_backend_free(backend);
1628
  }
 
 
 
1629
  }
1630
 
1631
  llama_cparams cparams;
@@ -1672,8 +1718,14 @@ struct llama_context {
1672
  // allocator for the input tensors
1673
  ggml_tallocr * alloc = nullptr;
1674
 
1675
- // temporary buffer for copying data to/from the backend
1676
- std::vector<no_init<uint8_t>> buf_copy;
1677
 
1678
  #ifdef GGML_USE_MPI
1679
  ggml_mpi_context * ctx_mpi = NULL;
@@ -2257,18 +2309,18 @@ struct llama_model_loader {
2257
  }
2258
 
2259
  switch (type_max) {
2260
- case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
2261
- case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
2262
- case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
2263
- case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
2264
- case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
2265
- case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
2266
- case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
2267
- case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
2268
- case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
2269
- case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
2270
- case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
2271
- case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
2272
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2273
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
2274
  default:
@@ -2618,6 +2670,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2618
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
2619
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
2620
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
 
2621
 
2622
  default: return "unknown, may not work";
2623
  }
@@ -2833,6 +2886,7 @@ static void llm_load_hparams(
2833
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2834
 
2835
  switch (hparams.n_layer) {
 
2836
  case 32: model.type = e_model::MODEL_3B; break;
2837
  default: model.type = e_model::MODEL_UNKNOWN;
2838
  }
@@ -2847,6 +2901,17 @@ static void llm_load_hparams(
2847
  default: model.type = e_model::MODEL_UNKNOWN;
2848
  }
2849
  } break;
2850
  case LLM_ARCH_PHI2:
2851
  {
2852
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2877,6 +2942,14 @@ static void llm_load_hparams(
2877
  default: model.type = e_model::MODEL_UNKNOWN;
2878
  }
2879
  } break;
2880
 
2881
  default: (void)0;
2882
  }
@@ -3438,7 +3511,12 @@ static bool llm_load_tensors(
3438
  {
3439
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3440
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3441
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
 
 
 
 
 
3442
  }
3443
 
3444
  for (int i = 0; i < n_layer; ++i) {
@@ -3632,6 +3710,11 @@ static bool llm_load_tensors(
3632
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3633
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3634
 
 
 
 
 
 
3635
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3636
  layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3637
 
@@ -3669,6 +3752,41 @@ static bool llm_load_tensors(
3669
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
3670
  }
3671
  } break;
3672
  case LLM_ARCH_PHI2:
3673
  {
3674
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -3779,6 +3897,42 @@ static bool llm_load_tensors(
3779
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3780
  }
3781
  } break;
3782
  default:
3783
  throw std::runtime_error("unknown architecture");
3784
  }
@@ -3815,8 +3969,10 @@ static bool llm_load_tensors(
3815
  else {
3816
  buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
3817
  if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
3818
- model.mlock_buf.init (ggml_backend_buffer_get_base(buf));
3819
- model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf));
 
 
3820
  }
3821
  }
3822
  if (buf == nullptr) {
@@ -3942,22 +4098,24 @@ static struct ggml_tensor * llm_build_inp_embd(
3942
  const llama_hparams & hparams,
3943
  const llama_batch & batch,
3944
  struct ggml_tensor * tok_embd,
 
 
3945
  const llm_build_cb & cb) {
3946
  const int64_t n_embd = hparams.n_embd;
3947
 
3948
  struct ggml_tensor * inpL;
3949
 
3950
  if (batch.token) {
3951
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
3952
  cb(inp_tokens, "inp_tokens", -1);
3953
 
3954
- inpL = ggml_get_rows(ctx, tok_embd, inp_tokens);
3955
  } else {
3956
  #ifdef GGML_USE_MPI
3957
  GGML_ASSERT(false && "not implemented");
3958
  #endif
3959
 
3960
- inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
3961
  }
3962
 
3963
  return inpL;
@@ -3971,6 +4129,7 @@ static void llm_build_k_shift(
3971
  const llama_cparams & cparams,
3972
  const llama_kv_cache & kv,
3973
  struct ggml_cgraph * graph,
 
3974
  llm_rope_type type,
3975
  int64_t n_ctx,
3976
  float freq_base,
@@ -3987,9 +4146,6 @@ static void llm_build_k_shift(
3987
  const float beta_fast = cparams.yarn_beta_fast;
3988
  const float beta_slow = cparams.yarn_beta_slow;
3989
 
3990
- struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
3991
- cb(K_shift, "K_shift", -1);
3992
-
3993
  int rope_type = 0;
3994
 
3995
  switch (type) {
@@ -4177,6 +4333,7 @@ static struct ggml_tensor * llm_build_kqv(
4177
  const llama_model & model,
4178
  const llama_hparams & hparams,
4179
  const llama_kv_cache & kv,
 
4180
  struct ggml_tensor * wo,
4181
  struct ggml_tensor * wo_b,
4182
  struct ggml_tensor * q_cur,
@@ -4255,6 +4412,8 @@ static struct ggml_tensor * llm_build_kqv(
4255
  struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
4256
  cb(cur, "kqv_merged_cont", il);
4257
 
 
 
4258
  cur = ggml_mul_mat(ctx, wo, cur);
4259
  if (wo_b) {
4260
  cb(cur, "kqv_wo", il);
@@ -4267,8 +4426,47 @@ static struct ggml_tensor * llm_build_kqv(
4267
  return cur;
4268
  }
4269
 
4270
  struct llm_build_context {
4271
  const llama_model & model;
 
4272
  const llama_hparams & hparams;
4273
  const llama_cparams & cparams;
4274
  const llama_batch & batch;
@@ -4315,6 +4513,7 @@ struct llm_build_context {
4315
  const llm_build_cb & cb,
4316
  bool worst_case) :
4317
  model (lctx.model),
 
4318
  hparams (model.hparams),
4319
  cparams (lctx.cparams),
4320
  batch (batch),
@@ -4375,20 +4574,20 @@ struct llm_build_context {
4375
  struct ggml_tensor * cur;
4376
  struct ggml_tensor * inpL;
4377
 
4378
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4379
  cb(inpL, "inp_embd", -1);
4380
 
4381
  // inp_pos - contains the positions
4382
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4383
  cb(inp_pos, "inp_pos", -1);
4384
 
4385
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4386
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4387
  cb(KQ_mask, "KQ_mask", -1);
4388
 
4389
  // shift the entire K-cache if needed
4390
  if (do_rope_shift) {
4391
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4392
  }
4393
 
4394
  for (int il = 0; il < n_layer; ++il) {
@@ -4424,12 +4623,6 @@ struct llm_build_context {
4424
  cb(Vcur, "Vcur", il);
4425
  }
4426
 
4427
- // these nodes are added to the graph together so that they are not reordered
4428
- // by doing so, the number of splits in the graph is reduced
4429
- ggml_build_forward_expand(gf, Qcur);
4430
- ggml_build_forward_expand(gf, Kcur);
4431
- ggml_build_forward_expand(gf, Vcur);
4432
-
4433
  Qcur = ggml_rope_custom(
4434
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
4435
  hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
@@ -4444,11 +4637,9 @@ struct llm_build_context {
4444
  );
4445
  cb(Kcur, "Kcur", il);
4446
 
4447
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4448
-
4449
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4450
  model.layers[il].wo, model.layers[il].bo,
4451
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4452
  cb(cur, "kqv_out", il);
4453
  }
4454
 
@@ -4567,20 +4758,20 @@ struct llm_build_context {
4567
  struct ggml_tensor * cur;
4568
  struct ggml_tensor * inpL;
4569
 
4570
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4571
  cb(inpL, "inp_embd", -1);
4572
 
4573
  // inp_pos - contains the positions
4574
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4575
  cb(inp_pos, "inp_pos", -1);
4576
 
4577
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4578
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4579
  cb(KQ_mask, "KQ_mask", -1);
4580
 
4581
  // shift the entire K-cache if needed
4582
  if (do_rope_shift) {
4583
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4584
  }
4585
 
4586
  for (int il = 0; il < n_layer; ++il) {
@@ -4625,14 +4816,13 @@ struct llm_build_context {
4625
  cb(Qcur, "Qcur", il);
4626
  cb(Kcur, "Kcur", il);
4627
 
4628
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4629
 
4630
  // apply ALiBi for 13B model
4631
  const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
4632
 
4633
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4634
  model.layers[il].wo, NULL,
4635
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4636
  cb(cur, "kqv_out", il);
4637
  }
4638
 
@@ -4689,20 +4879,20 @@ struct llm_build_context {
4689
  struct ggml_tensor * cur;
4690
  struct ggml_tensor * inpL;
4691
 
4692
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4693
  cb(inpL, "inp_embd", -1);
4694
 
4695
  // inp_pos - contains the positions
4696
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4697
  cb(inp_pos, "inp_pos", -1);
4698
 
4699
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4700
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4701
  cb(KQ_mask, "KQ_mask", -1);
4702
 
4703
  // shift the entire K-cache if needed
4704
  if (do_rope_shift) {
4705
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
4706
  }
4707
 
4708
  for (int il = 0; il < n_layer; ++il) {
@@ -4754,11 +4944,9 @@ struct llm_build_context {
4754
  );
4755
  cb(Kcur, "Kcur", il);
4756
 
4757
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4758
-
4759
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4760
  model.layers[il].wo, NULL,
4761
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4762
  cb(cur, "kqv_out", il);
4763
  }
4764
 
@@ -4813,15 +5001,15 @@ struct llm_build_context {
4813
  struct ggml_tensor * pos;
4814
  struct ggml_tensor * inpL;
4815
 
4816
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4817
  cb(inpL, "inp_embd", -1);
4818
 
4819
  // inp_pos - contains the positions
4820
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4821
  cb(inp_pos, "inp_pos", -1);
4822
 
4823
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4824
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4825
  cb(KQ_mask, "KQ_mask", -1);
4826
 
4827
  pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
@@ -4855,11 +5043,9 @@ struct llm_build_context {
4855
 
4856
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
4857
 
4858
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4859
-
4860
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4861
  model.layers[il].wo, model.layers[il].bo,
4862
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4863
  cb(cur, "kqv_out", il);
4864
  }
4865
 
@@ -4912,19 +5098,19 @@ struct llm_build_context {
4912
  struct ggml_tensor * cur;
4913
  struct ggml_tensor * inpL;
4914
 
4915
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4916
  cb(inpL, "inp_embd", -1);
4917
 
4918
  // inp_pos - contains the positions
4919
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4920
  cb(inp_pos, "inp_pos", -1);
4921
 
4922
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4923
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4924
  cb(KQ_mask, "KQ_mask", -1);
4925
 
4926
  if (do_rope_shift) {
4927
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
4928
  }
4929
 
4930
  for (int il = 0; il < n_layer; ++il) {
@@ -5062,12 +5248,9 @@ struct llm_build_context {
5062
  );
5063
  cb(Vcur, "Vcur", il);
5064
 
5065
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5066
-
5067
- // TODO: not tested, could be broken
5068
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5069
  model.layers[il].wo, model.layers[il].bo,
5070
- Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5071
  cb(cur, "kqv_out", il);
5072
  }
5073
 
@@ -5122,11 +5305,11 @@ struct llm_build_context {
5122
  struct ggml_tensor * cur;
5123
  struct ggml_tensor * inpL;
5124
 
5125
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5126
  cb(inpL, "inp_embd", -1);
5127
 
5128
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5129
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5130
  cb(KQ_mask, "KQ_mask", -1);
5131
 
5132
  for (int il = 0; il < n_layer; ++il) {
@@ -5154,11 +5337,9 @@ struct llm_build_context {
5154
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5155
  cb(Qcur, "Qcur", il);
5156
 
5157
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5158
-
5159
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5160
  model.layers[il].wo, NULL,
5161
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5162
  cb(cur, "kqv_out", il);
5163
  }
5164
 
@@ -5214,11 +5395,11 @@ struct llm_build_context {
5214
  struct ggml_tensor * cur;
5215
  struct ggml_tensor * inpL;
5216
 
5217
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5218
  cb(inpL, "inp_embd", -1);
5219
 
5220
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5221
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5222
  cb(KQ_mask, "KQ_mask", -1);
5223
 
5224
  inpL = llm_build_norm(ctx0, inpL, hparams,
@@ -5252,11 +5433,9 @@ struct llm_build_context {
5252
 
5253
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5254
 
5255
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5256
-
5257
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5258
  model.layers[il].wo, model.layers[il].bo,
5259
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5260
  cb(cur, "kqv_out", il);
5261
  }
5262
 
@@ -5309,11 +5488,11 @@ struct llm_build_context {
5309
  struct ggml_tensor * cur;
5310
  struct ggml_tensor * inpL;
5311
 
5312
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5313
  cb(inpL, "inp_embd", -1);
5314
 
5315
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5316
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5317
  cb(KQ_mask, "KQ_mask", -1);
5318
 
5319
  for (int il = 0; il < n_layer; ++il) {
@@ -5347,11 +5526,9 @@ struct llm_build_context {
5347
 
5348
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5349
 
5350
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5351
-
5352
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5353
  model.layers[il].wo, NULL,
5354
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5355
  cb(cur, "kqv_out", il);
5356
  }
5357
 
@@ -5407,20 +5584,20 @@ struct llm_build_context {
5407
  struct ggml_tensor * cur;
5408
  struct ggml_tensor * inpL;
5409
 
5410
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5411
  cb(inpL, "inp_embd", -1);
5412
 
5413
  // inp_pos - contains the positions
5414
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5415
  cb(inp_pos, "inp_pos", -1);
5416
 
5417
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5418
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5419
  cb(KQ_mask, "KQ_mask", -1);
5420
 
5421
  // shift the entire K-cache if needed
5422
  if (do_rope_shift) {
5423
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5424
  }
5425
 
5426
  for (int il = 0; il < n_layer; ++il) {
@@ -5438,12 +5615,24 @@ struct llm_build_context {
5438
  // compute Q and K and RoPE them
5439
  struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
5440
  cb(Qcur, "Qcur", il);
 
 
 
 
5441
 
5442
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
5443
  cb(Kcur, "Kcur", il);
 
 
 
 
5444
 
5445
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
5446
  cb(Vcur, "Vcur", il);
 
 
 
 
5447
 
5448
  Qcur = ggml_rope_custom(
5449
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@@ -5459,11 +5648,9 @@ struct llm_build_context {
5459
  );
5460
  cb(Kcur, "Kcur", il);
5461
 
5462
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5463
-
5464
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5465
  model.layers[il].wo, NULL,
5466
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5467
  cb(cur, "kqv_out", il);
5468
  }
5469
 
@@ -5520,20 +5707,20 @@ struct llm_build_context {
5520
  struct ggml_tensor * cur;
5521
  struct ggml_tensor * inpL;
5522
 
5523
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5524
  cb(inpL, "inp_embd", -1);
5525
 
5526
  // inp_pos - contains the positions
5527
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5528
  cb(inp_pos, "inp_pos", -1);
5529
 
5530
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5531
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5532
  cb(KQ_mask, "KQ_mask", -1);
5533
 
5534
  // shift the entire K-cache if needed
5535
  if (do_rope_shift) {
5536
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5537
  }
5538
 
5539
  for (int il = 0; il < n_layer; ++il) {
@@ -5576,11 +5763,9 @@ struct llm_build_context {
5576
  );
5577
  cb(Kcur, "Kcur", il);
5578
 
5579
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5580
-
5581
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5582
  model.layers[il].wo, NULL,
5583
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5584
  cb(cur, "kqv_out", il);
5585
  }
5586
 
@@ -5625,6 +5810,126 @@ struct llm_build_context {
5625
 
5626
  return gf;
5627
  }
5628
  struct ggml_cgraph * build_phi2() {
5629
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5630
 
@@ -5637,20 +5942,20 @@ struct llm_build_context {
5637
  struct ggml_tensor * ffn_output;
5638
  struct ggml_tensor * inpL;
5639
 
5640
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5641
  cb(inpL, "inp_embd", -1);
5642
 
5643
  // inp_pos - contains the positions
5644
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5645
  cb(inp_pos, "inp_pos", -1);
5646
 
5647
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5648
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5649
  cb(KQ_mask, "KQ_mask", -1);
5650
 
5651
  // shift the entire K-cache if needed
5652
  if (do_rope_shift) {
5653
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5654
  }
5655
 
5656
  for (int il = 0; il < n_layer; ++il) {
@@ -5706,11 +6011,9 @@ struct llm_build_context {
5706
  );
5707
  cb(Kcur, "Kcur", il);
5708
 
5709
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5710
-
5711
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5712
  model.layers[il].wo, model.layers[il].bo,
5713
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
5714
  cb(cur, "kqv_out", il);
5715
  }
5716
 
@@ -5761,20 +6064,20 @@ struct llm_build_context {
5761
  struct ggml_tensor * cur;
5762
  struct ggml_tensor * inpL;
5763
 
5764
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5765
  cb(inpL, "inp_embd", -1);
5766
 
5767
  // inp_pos - contains the positions
5768
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5769
  cb(inp_pos, "inp_pos", -1);
5770
 
5771
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5772
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5773
  cb(KQ_mask, "KQ_mask", -1);
5774
 
5775
  // shift the entire K-cache if needed
5776
  if (do_rope_shift) {
5777
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
5778
  }
5779
 
5780
  for (int il = 0; il < n_layer; ++il) {
@@ -5811,11 +6114,9 @@ struct llm_build_context {
5811
  ext_factor, attn_factor, beta_fast, beta_slow);
5812
  cb(Kcur, "Kcur", il);
5813
 
5814
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5815
-
5816
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5817
  model.layers[il].wo, NULL,
5818
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5819
  cb(cur, "kqv_out", il);
5820
  }
5821
  struct ggml_tensor * sa_out = cur;
@@ -5870,15 +6171,15 @@ struct llm_build_context {
5870
  struct ggml_tensor * pos;
5871
  struct ggml_tensor * inpL;
5872
 
5873
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5874
  cb(inpL, "inp_embd", -1);
5875
 
5876
  // inp_pos - contains the positions
5877
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5878
  cb(inp_pos, "inp_pos", -1);
5879
 
5880
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5881
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5882
  cb(KQ_mask, "KQ_mask", -1);
5883
 
5884
  pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
@@ -5912,11 +6213,118 @@ struct llm_build_context {
5912
 
5913
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5914
 
5915
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5916
 
5917
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5918
  model.layers[il].wo, model.layers[il].bo,
5919
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5920
  cb(cur, "kqv_out", il);
5921
  }
5922
 
@@ -5968,15 +6376,7 @@ static struct ggml_cgraph * llama_build_graph(
5968
  // check if we should build the worst-case graph (for memory measurement)
5969
  const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
5970
 
5971
- // keep track of the input that has already been allocated
5972
- bool alloc_inp_tokens = false;
5973
- bool alloc_inp_embd = false;
5974
- bool alloc_inp_pos = false;
5975
- bool alloc_inp_KQ_mask = false;
5976
- bool alloc_inp_K_shift = false;
5977
-
5978
  // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
5979
- // TODO: improve handling of input and output tensors, then replace this with ggml_set_name
5980
  llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
5981
  if (il >= 0) {
5982
  ggml_format_name(cur, "%s-%d", name, il);
@@ -5984,118 +6384,78 @@ static struct ggml_cgraph * llama_build_graph(
5984
  ggml_set_name(cur, name);
5985
  }
5986
 
5987
- //
5988
- // allocate input tensors and set input data
5989
- //
5990
-
5991
- if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
5992
- ggml_tallocr_alloc(lctx.alloc, cur);
5993
-
5994
- if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
5995
- const int64_t n_tokens = cur->ne[0];
5996
-
5997
- ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
5998
  }
5999
-
6000
- alloc_inp_tokens = true;
6001
  }
 
 
 
6002
 
6003
- if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
6004
- ggml_tallocr_alloc(lctx.alloc, cur);
6005
 
6006
- if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) {
6007
- const int64_t n_embd = cur->ne[0];
6008
- const int64_t n_tokens = cur->ne[1];
6009
 
6010
- ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
6011
- }
 
6012
 
6013
- alloc_inp_embd = true;
6014
  }
6015
 
6016
- if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
6017
- ggml_tallocr_alloc(lctx.alloc, cur);
 
6018
 
6019
- if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) {
6020
- const int64_t n_tokens = cur->ne[0];
6021
 
6022
- static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
6023
- ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
6024
- }
6025
 
6026
- alloc_inp_pos = true;
6027
  }
6028
 
6029
- if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
6030
- ggml_tallocr_alloc(lctx.alloc, cur);
 
6031
 
6032
- if (!ggml_tallocr_is_measure(lctx.alloc)) {
6033
- const int64_t n_kv = cur->ne[0];
6034
- const int64_t n_tokens = cur->ne[1];
6035
 
6036
- float * data;
6037
- if (ggml_backend_buffer_is_host(cur->buffer)) {
6038
- data = (float *) cur->data;
6039
- } else {
6040
- lctx.buf_copy.resize(ggml_nbytes(cur));
6041
- data = (float *) lctx.buf_copy.data();
6042
- }
6043
 
6044
- for (int h = 0; h < 1; ++h) {
6045
- for (int j = 0; j < n_tokens; ++j) {
6046
- const llama_pos pos = batch.pos[j];
6047
- const llama_seq_id seq_id = batch.seq_id[j][0];
6048
-
6049
- for (int i = 0; i < n_kv; ++i) {
6050
- float f;
6051
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
6052
- f = -INFINITY;
6053
- } else {
6054
- f = 0;
6055
- }
6056
- data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
6057
  }
 
6058
  }
6059
  }
6060
-
6061
- if (data != cur->data) {
6062
- ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
6063
- }
6064
  }
6065
-
6066
- alloc_inp_KQ_mask = true;
6067
  }
6068
 
6069
- if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
6070
- ggml_tallocr_alloc(lctx.alloc, cur);
6071
-
6072
- if (!ggml_tallocr_is_measure(lctx.alloc)) {
6073
- const int64_t n_ctx = cur->ne[0];
6074
 
6075
- int32_t * data;
6076
- if (ggml_backend_buffer_is_host(cur->buffer)) {
6077
- data = (int32_t *) cur->data;
6078
- } else {
6079
- lctx.buf_copy.resize(ggml_nbytes(cur));
6080
- data = (int32_t *) lctx.buf_copy.data();
6081
- }
6082
 
6083
- for (int i = 0; i < n_ctx; ++i) {
6084
- data[i] = lctx.kv_self.cells[i].delta;
6085
- }
6086
-
6087
- if (data != cur->data) {
6088
- ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
6089
- }
6090
  }
6091
-
6092
- alloc_inp_K_shift = true;
6093
  }
6094
- };
6095
-
6096
- struct ggml_cgraph * result = NULL;
6097
-
6098
- struct llm_build_context llm(lctx, batch, cb, worst_case);
6099
 
6100
  llm.init();
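Note: the code removed in this hunk is the old input path, where tensors such as inp_tokens, inp_pos, KQ_mask and K_shift were allocated and filled inside the graph-build callback, with the mask set to -INFINITY wherever a cached cell belongs to another sequence or lies after the query position. The sync moves input handling onto the persistent inp_* tensors added to llama_context elsewhere in this diff. As a reference for what that mask encodes, here is a small standalone sketch that follows the removed loop (the struct and function names are mine, and real KV cells can carry several sequence ids, which is simplified away here):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

struct kv_cell {
    int pos    = -1;   // position of the cached token
    int seq_id = -1;   // which sequence it belongs to (single-seq cells only)
};

// Build an [n_tokens x n_kv] additive mask: 0 where the query may attend,
// -inf where the cell is from another sequence or from the future.
static std::vector<float> build_kq_mask(const std::vector<kv_cell> & cells,
                                        const std::vector<int> & pos,
                                        const std::vector<int> & seq_id) {
    const size_t n_kv     = cells.size();
    const size_t n_tokens = pos.size();
    std::vector<float> mask(n_tokens * n_kv);

    for (size_t j = 0; j < n_tokens; ++j) {
        for (size_t i = 0; i < n_kv; ++i) {
            const bool visible = cells[i].seq_id == seq_id[j] && cells[i].pos <= pos[j];
            mask[j*n_kv + i] = visible ? 0.0f : -INFINITY;
        }
    }
    return mask;
}

int main() {
    std::vector<kv_cell> cells = { {0, 0}, {1, 0}, {0, 1} };  // two sequences in the cache
    auto mask = build_kq_mask(cells, /*pos=*/{1}, /*seq_id=*/{0});
    printf("%g %g %g\n", mask[0], mask[1], mask[2]);          // 0 0 -inf
}
```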
6101
 
@@ -6140,6 +6500,10 @@ static struct ggml_cgraph * llama_build_graph(
6140
  {
6141
  result = llm.build_qwen();
6142
  } break;
 
 
 
 
6143
  case LLM_ARCH_PHI2:
6144
  {
6145
  result = llm.build_phi2();
@@ -6152,6 +6516,10 @@ static struct ggml_cgraph * llama_build_graph(
6152
  {
6153
  result = llm.build_gpt2();
6154
  } break;
 
 
 
 
6155
  default:
6156
  GGML_ASSERT(false);
6157
  }
@@ -7588,10 +7956,57 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
7588
  auto comp = [](const llama_token_data & a, const llama_token_data & b) {
7589
  return a.logit > b.logit;
7590
  };
7591
- if (k == (int) candidates->size) {
7592
- std::sort(candidates->data, candidates->data + candidates->size, comp);
7593
- } else {
7594
  std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
7595
  }
7596
  candidates->sorted = true;
7597
  }
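Note: the removed branch sorted the whole candidate list when k equals its size and used std::partial_sort otherwise; the replacement code is not visible in this rendering, so the snippet below is only a reminder of what top-k selection over the logits does, not the new implementation. A minimal sketch (the real sampler works in place on llama_token_data_array, while this sketch uses a vector and resizes for brevity):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

struct token_data {
    int   id;
    float logit;
};

// Keep the k highest-logit candidates at the front, in descending order.
static void top_k(std::vector<token_data> & cand, size_t k) {
    k = std::min(k, cand.size());
    std::partial_sort(cand.begin(), cand.begin() + k, cand.end(),
                      [](const token_data & a, const token_data & b) {
                          return a.logit > b.logit;
                      });
    cand.resize(k);   // drop the rest
}

int main() {
    std::vector<token_data> cand = { {0, 0.1f}, {1, 2.5f}, {2, -1.0f}, {3, 1.7f} };
    top_k(cand, 2);
    for (const auto & c : cand) printf("id=%d logit=%.2f\n", c.id, c.logit);
    // id=1 logit=2.50
    // id=3 logit=1.70
}
```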
@@ -7783,6 +8198,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
7783
  }
7784
  }
7785
 
7786
  void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
7787
  const int64_t t_start_sample_us = ggml_time_us();
7788
 
@@ -8371,9 +8853,13 @@ struct quantize_state_internal {
8371
  const llama_model_quantize_params * params;
8372
 
8373
  int n_attention_wv = 0;
8374
- int n_feed_forward_w2 = 0;
 
 
8375
  int i_attention_wv = 0;
8376
- int i_feed_forward_w2 = 0;
 
 
8377
 
8378
  int n_k_quantized = 0;
8379
  int n_fallback = 0;
@@ -8457,6 +8943,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8457
  auto use_more_bits = [](int i_layer, int num_layers) -> bool {
8458
  return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
8459
  };
8460
 
8461
  if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
8462
  int nx = tensor->ne[0];
@@ -8476,8 +8979,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8476
  ++qs.i_attention_wv;
8477
  }
8478
  else if (name.find("ffn_down") != std::string::npos) {
8479
- if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
8480
- ++qs.i_feed_forward_w2;
8481
  }
8482
  else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
8483
  } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -8514,27 +9017,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8514
  // TODO: explore better strategies
8515
  new_type = GGML_TYPE_Q8_0;
8516
  }
8517
- } else if (name.find("ffn_down") != std::string::npos) {
8518
- const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
8519
- int i_layer, n_layer;
8520
- if (n_expert == 1) {
8521
- i_layer = qs.i_feed_forward_w2;
8522
- n_layer = qs.n_feed_forward_w2;
8523
- } else {
8524
- // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
8525
- // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
8526
- // for getting the current layer as I initially thought, and we need to resort to parsing the
8527
- // tensor name.
8528
- n_layer = qs.n_feed_forward_w2 / n_expert;
8529
- if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
8530
- throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
8531
- }
8532
- if (i_layer < 0 || i_layer >= n_layer) {
8533
- throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
8534
- }
8535
  }
 
 
 
8536
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
8537
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
8538
  if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
8539
  }
8540
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -8564,11 +9054,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8564
  // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
8565
  new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
8566
  }
8567
- ++qs.i_feed_forward_w2;
8568
  } else if (name.find("attn_output.weight") != std::string::npos) {
8569
  if (arch != LLM_ARCH_FALCON) {
8570
  if (qs.model.hparams.n_expert == 8) {
8571
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
 
8572
  ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
8573
  new_type = GGML_TYPE_Q5_K;
8574
  }
@@ -8586,6 +9077,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8586
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
8587
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
8588
  }
8589
  // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
8590
  //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
8591
  // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -8640,8 +9149,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
8640
  case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
8641
 
8642
  // K-quants
 
8643
  case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
8644
- case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
8645
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
8646
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
8647
  case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -8709,12 +9219,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
8709
  ++qs.n_attention_wv;
8710
  }
8711
  else if (name.find("ffn_down") != std::string::npos) {
8712
- ++qs.n_feed_forward_w2;
8713
  }
8714
  }
8715
- if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
8716
- LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
8717
- __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
8718
  }
8719
 
8720
  size_t total_size_org = 0;
@@ -9522,6 +10038,35 @@ struct llama_context * llama_new_context_with_model(
9522
  ctx->embedding.resize(hparams.n_embd);
9523
  }
9524
 
9525
  {
9526
  // buffer types used for the compute buffer of each backend
9527
  std::vector<ggml_backend_buffer_type_t> backend_buft;
@@ -9548,9 +10093,6 @@ struct llama_context * llama_new_context_with_model(
9548
 
9549
  // initialize scheduler with the worst-case graph
9550
  ggml_backend_sched_init_measure(ctx->sched, gf);
9551
- // note: the number of splits during measure is higher than during inference due to the kv shift
9552
- int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
9553
- LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
9554
  ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
9555
 
9556
  for (ggml_backend_t backend : ctx->backends) {
@@ -9559,6 +10101,10 @@ struct llama_context * llama_new_context_with_model(
9559
  ggml_backend_buffer_name(buf),
9560
  ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
9561
  }
 
 
 
 
9562
  }
9563
  }
9564
 
 
192
  LLM_ARCH_BLOOM,
193
  LLM_ARCH_STABLELM,
194
  LLM_ARCH_QWEN,
195
+ LLM_ARCH_QWEN2,
196
  LLM_ARCH_PHI2,
197
  LLM_ARCH_PLAMO,
198
+ LLM_ARCH_CODESHELL,
199
  LLM_ARCH_UNKNOWN,
200
  };
201
 
 
213
  { LLM_ARCH_BLOOM, "bloom" },
214
  { LLM_ARCH_STABLELM, "stablelm" },
215
  { LLM_ARCH_QWEN, "qwen" },
216
+ { LLM_ARCH_QWEN2, "qwen2" },
217
  { LLM_ARCH_PHI2, "phi2" },
218
  { LLM_ARCH_PLAMO, "plamo" },
219
+ { LLM_ARCH_CODESHELL, "codeshell" },
220
  };
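Note: for readers skimming the sync, qwen2 and codeshell are wired in through the usual pair of tables, the llm_arch enum above and this LLM_ARCH_NAMES map, plus per-architecture tensor-name tables further down. A small standalone sketch of that registration pattern and of mapping a GGUF architecture string back to the enum (the enum values and helper here are illustrative, not the real llama.cpp symbols):

```cpp
#include <cstdio>
#include <map>
#include <string>

// Illustrative stand-ins for llm_arch / LLM_ARCH_NAMES.
enum my_arch { ARCH_QWEN, ARCH_QWEN2, ARCH_CODESHELL, ARCH_UNKNOWN };

static const std::map<my_arch, std::string> ARCH_NAMES = {
    { ARCH_QWEN,      "qwen"      },
    { ARCH_QWEN2,     "qwen2"     },
    { ARCH_CODESHELL, "codeshell" },
};

// Reverse lookup: map the "general.architecture" string from a model header
// back to the enum; unknown strings fall through to ARCH_UNKNOWN.
static my_arch arch_from_name(const std::string & name) {
    for (const auto & kv : ARCH_NAMES) {
        if (kv.second == name) {
            return kv.first;
        }
    }
    return ARCH_UNKNOWN;
}

int main() {
    printf("%d\n", arch_from_name("codeshell")); // 2
    printf("%d\n", arch_from_name("mystery"));   // 3 (ARCH_UNKNOWN)
}
```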
221
 
222
  enum llm_kv {
 
570
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
571
  },
572
  },
573
+ {
574
+ LLM_ARCH_QWEN2,
575
+ {
576
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
577
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
578
+ { LLM_TENSOR_OUTPUT, "output" },
579
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
580
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
581
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
582
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
583
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
584
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
585
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
586
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
587
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
588
+ },
589
+ },
590
  {
591
  LLM_ARCH_PHI2,
592
  {
 
621
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
622
  },
623
  },
624
+ {
625
+ LLM_ARCH_CODESHELL,
626
+ {
627
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
628
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
629
+ { LLM_TENSOR_OUTPUT, "output" },
630
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
631
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
632
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
633
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
634
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
635
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
636
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
637
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
638
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
639
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
640
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
641
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
642
+ },
643
+ },
644
 
645
  {
646
  LLM_ARCH_UNKNOWN,
 
1325
  // available llama models
1326
  enum e_model {
1327
  MODEL_UNKNOWN,
1328
+ MODEL_0_5B,
1329
  MODEL_1B,
1330
  MODEL_3B,
1331
+ MODEL_4B,
1332
  MODEL_7B,
1333
  MODEL_8B,
1334
  MODEL_13B,
 
1642
  std::unique_ptr<llama_mmap> mapping;
1643
 
1644
  // objects representing data potentially being locked in memory
1645
+ std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
1646
  llama_mlock mlock_mmap;
1647
 
1648
  // for quantize-stats only
 
1669
  for (ggml_backend_t backend : backends) {
1670
  ggml_backend_free(backend);
1671
  }
1672
+
1673
+ ggml_backend_buffer_free(buf_input);
1674
+ ggml_free(ctx_input);
1675
  }
1676
 
1677
  llama_cparams cparams;
 
1718
  // allocator for the input tensors
1719
  ggml_tallocr * alloc = nullptr;
1720
 
1721
+ // input tensors
1722
+ ggml_backend_buffer_t buf_input = nullptr;
1723
+ ggml_context * ctx_input = nullptr;
1724
+ struct ggml_tensor * inp_tokens; // I32 [n_batch]
1725
+ struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
1726
+ struct ggml_tensor * inp_pos; // I32 [n_batch]
1727
+ struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
1728
+ struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
1729
 
1730
  #ifdef GGML_USE_MPI
1731
  ggml_mpi_context * ctx_mpi = NULL;
 
2309
  }
2310
 
2311
  switch (type_max) {
2312
+ case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
2313
+ case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
2314
+ case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
2315
+ case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
2316
+ case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
2317
+ case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
2318
+ case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
2319
+ case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
2320
+ case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
2321
+ case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
2322
+ case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
2323
+ case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
2324
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2325
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
2326
  default:
 
2670
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
2671
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
2672
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
2673
+ case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
2674
 
2675
  default: return "unknown, may not work";
2676
  }
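Note: the bpw figures in these strings are average bits per weight of the block format. As a hedged aside: for the classic Q4_0 layout a block stores 32 weights as 16 bytes of 4-bit quants plus one fp16 scale, 18 bytes per 32 weights, which works out to 4.5 bpw; the IQ2 formats listed above are more elaborate, so only the arithmetic, not their layouts, is shown below.

```cpp
#include <cstdio>

// Average bits per weight for a simple block format:
//   block = n_weights quantized values + per-block metadata (scales, mins).
static double bits_per_weight(int n_weights, int quant_bits, int metadata_bytes) {
    const double total_bits = n_weights * quant_bits + 8.0 * metadata_bytes;
    return total_bits / n_weights;
}

int main() {
    // Q4_0: 32 weights, 4-bit quants, one fp16 scale (2 bytes) per block.
    printf("Q4_0: %.4f bpw\n", bits_per_weight(32, 4, 2));   // 4.5000
    // Q8_0: 32 weights, 8-bit quants, one fp16 scale per block.
    printf("Q8_0: %.4f bpw\n", bits_per_weight(32, 8, 2));   // 8.5000
}
```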
 
2886
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2887
 
2888
  switch (hparams.n_layer) {
2889
+ case 24: model.type = e_model::MODEL_1B; break;
2890
  case 32: model.type = e_model::MODEL_3B; break;
2891
  default: model.type = e_model::MODEL_UNKNOWN;
2892
  }
 
2901
  default: model.type = e_model::MODEL_UNKNOWN;
2902
  }
2903
  } break;
2904
+ case LLM_ARCH_QWEN2:
2905
+ {
2906
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2907
+ switch (hparams.n_layer) {
2908
+ case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
2909
+ case 32: model.type = e_model::MODEL_7B; break;
2910
+ case 40: model.type = hparams.n_head == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
2911
+ case 80: model.type = e_model::MODEL_70B; break;
2912
+ default: model.type = e_model::MODEL_UNKNOWN;
2913
+ }
2914
+ } break;
2915
  case LLM_ARCH_PHI2:
2916
  {
2917
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
2942
  default: model.type = e_model::MODEL_UNKNOWN;
2943
  }
2944
  } break;
2945
+ case LLM_ARCH_CODESHELL:
2946
+ {
2947
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2948
+ switch (hparams.n_layer) {
2949
+ case 42: model.type = e_model::MODEL_SMALL; break;
2950
+ default: model.type = e_model::MODEL_UNKNOWN;
2951
+ }
2952
+ } break;
2953
 
2954
  default: (void)0;
2955
  }
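Note: both new cases follow the existing convention, where the loader never reads an explicit size label and instead infers e_model from structural hyperparameters, n_layer first, with n_embd and n_head as tie-breakers in the QWEN2 case (which is also why MODEL_0_5B and MODEL_4B were added to e_model earlier in this diff). A compact sketch of that inference, using the thresholds visible in the QWEN2 case:

```cpp
#include <cstdint>
#include <cstdio>

enum class model_size { UNKNOWN, S_0_5B, S_1B, S_4B, S_7B, S_13B, S_70B };

// Infer a coarse size label from hyperparameters, mirroring the switch on
// hparams.n_layer in the diff. The tie-breakers (n_embd, n_head) are the
// ones the QWEN2 case uses; everything else here is illustrative.
static model_size infer_size(uint32_t n_layer, uint32_t n_embd, uint32_t n_head) {
    switch (n_layer) {
        case 24: return n_embd == 1024 ? model_size::S_0_5B : model_size::S_1B;
        case 32: return model_size::S_7B;
        case 40: return n_head == 20 ? model_size::S_4B : model_size::S_13B;
        case 80: return model_size::S_70B;
        default: return model_size::UNKNOWN;
    }
}

int main() {
    printf("%d\n", (int) infer_size(24, 1024, 16)); // S_0_5B
    printf("%d\n", (int) infer_size(40, 5120, 40)); // S_13B
}
```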
 
3511
  {
3512
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3513
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3514
+ if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
3515
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3516
+ } else {
3517
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
3518
+ ml.n_created--; // artificial tensor
3519
+ }
3520
  }
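Note: the new branch handles models that ship without a separate output.weight. When gguf_find_tensor does not locate it, the loader reuses token_embd.weight as the output projection (tied embeddings) and decrements n_created so the tensor count still matches the file. A toy sketch of the fallback decision, with a plain map standing in for the GGUF tensor index:

```cpp
#include <cstdio>
#include <map>
#include <string>

// Stand-in for "does the file contain this tensor?".
static bool has_tensor(const std::map<std::string, int> & index, const std::string & name) {
    return index.count(name) > 0;
}

int main() {
    // Hypothetical tensor index of a model with tied embeddings:
    // token_embd.weight exists, output.weight does not.
    std::map<std::string, int> index = { { "token_embd.weight", 0 } };

    // Prefer a dedicated output matrix; otherwise fall back to the
    // token embedding matrix.
    const std::string output_name =
        has_tensor(index, "output.weight") ? "output.weight" : "token_embd.weight";

    printf("logits projection uses: %s\n", output_name.c_str());
}
```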
3521
 
3522
  for (int i = 0; i < n_layer; ++i) {
 
3710
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3711
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3712
 
3713
+ // optional bias tensors, present in Stable LM 2 1.6B
3714
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
3715
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
3716
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
3717
+
3718
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3719
  layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3720
 
 
3752
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
3753
  }
3754
  } break;
3755
+ case LLM_ARCH_QWEN2:
3756
+ {
3757
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3758
+
3759
+ // output
3760
+ {
3761
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3762
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3763
+ }
3764
+
3765
+ for (int i = 0; i < n_layer; ++i) {
3766
+ ggml_context * ctx_layer = ctx_for_layer(i);
3767
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3768
+
3769
+ auto & layer = model.layers[i];
3770
+
3771
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3772
+
3773
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3774
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3775
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3776
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3777
+
3778
+ // optional bias tensors
3779
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
3780
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
3781
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
3782
+
3783
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3784
+
3785
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
3786
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3787
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3788
+ }
3789
+ } break;
3790
  case LLM_ARCH_PHI2:
3791
  {
3792
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
3897
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3898
  }
3899
  } break;
3900
+ case LLM_ARCH_CODESHELL:
3901
+ {
3902
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3903
+
3904
+ // output
3905
+ {
3906
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3907
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3908
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3909
+ }
3910
+
3911
+ for (int i = 0; i < n_layer; ++i) {
3912
+ ggml_context * ctx_layer = ctx_for_layer(i);
3913
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3914
+
3915
+ auto & layer = model.layers[i];
3916
+
3917
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3918
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3919
+
3920
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3921
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
3922
+
3923
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3924
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3925
+
3926
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3927
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3928
+
3929
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3930
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3931
+
3932
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3933
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3934
+ }
3935
+ } break;
3936
  default:
3937
  throw std::runtime_error("unknown architecture");
3938
  }
 
3969
  else {
3970
  buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
3971
  if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
3972
+ model.mlock_bufs.emplace_back(new llama_mlock);
3973
+ auto & mlock_buf = model.mlock_bufs.back();
3974
+ mlock_buf->init (ggml_backend_buffer_get_base(buf));
3975
+ mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
3976
  }
3977
  }
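Note: since weights can now end up in several host buffers (one per buffer type), the single model.mlock_buf becomes a vector of heap-allocated llama_mlock objects, one per pinned buffer; unique_ptr presumably keeps each lock object at a stable address as the vector grows. A self-contained sketch of the same pattern around POSIX mlock/munlock (class name and layout are mine, not llama.cpp's):

```cpp
#include <cstddef>
#include <memory>
#include <vector>
#include <sys/mman.h>   // mlock, munlock (POSIX)

// Minimal RAII wrapper: pin a host buffer in RAM, unpin on destruction.
struct pinned_region {
    void * addr = nullptr;
    size_t size = 0;

    bool init(void * p, size_t n) {
        if (mlock(p, n) != 0) {
            return false;          // e.g. RLIMIT_MEMLOCK too low
        }
        addr = p; size = n;
        return true;
    }

    ~pinned_region() {
        if (addr) {
            munlock(addr, size);
        }
    }
};

int main() {
    std::vector<std::unique_ptr<pinned_region>> locks;

    // For each host-resident backend buffer, keep one lock object alive
    // for the lifetime of the model (here: a static array as stand-in).
    static char buffer[1 << 16];
    locks.emplace_back(new pinned_region());
    locks.back()->init(buffer, sizeof(buffer));
}   // destroying the vector unpins everything
```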
3978
  if (buf == nullptr) {
 
4098
  const llama_hparams & hparams,
4099
  const llama_batch & batch,
4100
  struct ggml_tensor * tok_embd,
4101
+ struct ggml_tensor * inp_tokens,
4102
+ struct ggml_tensor * inp_embd,
4103
  const llm_build_cb & cb) {
4104
  const int64_t n_embd = hparams.n_embd;
4105
 
4106
  struct ggml_tensor * inpL;
4107
 
4108
  if (batch.token) {
4109
+ struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0);
4110
  cb(inp_tokens, "inp_tokens", -1);
4111
 
4112
+ inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v);
4113
  } else {
4114
  #ifdef GGML_USE_MPI
4115
  GGML_ASSERT(false && "not implemented");
4116
  #endif
4117
 
4118
+ inpL = ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0);
4119
  }
4120
 
4121
  return inpL;
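Note: this is the heart of the refactor in this sync. Instead of creating fresh inp_tokens / inp_embd tensors in every graph build and filling them from a callback, the context now owns persistent, batch-capacity input tensors (the new buf_input / ctx_input / inp_* members above) and each graph takes a view sized to the current batch via ggml_view_1d / ggml_view_2d. A ggml-free sketch of the "allocate once at capacity, hand out per-batch views" idea (types and names here are illustrative):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// A fixed-capacity input buffer allocated once (capacity = n_batch),
// from which each decode step takes a view sized to the current batch.
struct input_buffer {
    std::vector<int32_t> tokens;   // capacity n_batch, reused every step

    explicit input_buffer(size_t n_batch) : tokens(n_batch) {}

    // "View" over the first n_tokens entries: no allocation per graph.
    std::pair<int32_t *, size_t> view(size_t n_tokens) {
        assert(n_tokens <= tokens.size());
        return { tokens.data(), n_tokens };
    }
};

int main() {
    input_buffer inp(512);              // allocated once with the context

    // per step: copy the batch tokens into the persistent buffer ...
    const int32_t batch[3] = { 1, 15043, 3186 };
    auto [ptr, n] = inp.view(3);
    for (size_t i = 0; i < n; ++i) ptr[i] = batch[i];

    // ... and build the graph against the view, not a new tensor.
    printf("view of %zu tokens, first = %d\n", n, ptr[0]);
}
```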
 
4129
  const llama_cparams & cparams,
4130
  const llama_kv_cache & kv,
4131
  struct ggml_cgraph * graph,
4132
+ struct ggml_tensor * K_shift,
4133
  llm_rope_type type,
4134
  int64_t n_ctx,
4135
  float freq_base,
 
4146
  const float beta_fast = cparams.yarn_beta_fast;
4147
  const float beta_slow = cparams.yarn_beta_slow;
4148
 
 
 
 
4149
  int rope_type = 0;
4150
 
4151
  switch (type) {
 
4333
  const llama_model & model,
4334
  const llama_hparams & hparams,
4335
  const llama_kv_cache & kv,
4336
+ struct ggml_cgraph * graph,
4337
  struct ggml_tensor * wo,
4338
  struct ggml_tensor * wo_b,
4339
  struct ggml_tensor * q_cur,
 
4412
  struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
4413
  cb(cur, "kqv_merged_cont", il);
4414
 
4415
+ ggml_build_forward_expand(graph, cur);
4416
+
4417
  cur = ggml_mul_mat(ctx, wo, cur);
4418
  if (wo_b) {
4419
  cb(cur, "kqv_wo", il);
 
4426
  return cur;
4427
  }
4428
 
4429
+ static struct ggml_tensor * llm_build_kv(
4430
+ struct ggml_context * ctx,
4431
+ const llama_model & model,
4432
+ const llama_hparams & hparams,
4433
+ const llama_kv_cache & kv,
4434
+ struct ggml_cgraph * graph,
4435
+ struct ggml_tensor * wo,
4436
+ struct ggml_tensor * wo_b,
4437
+ struct ggml_tensor * k_cur,
4438
+ struct ggml_tensor * v_cur,
4439
+ struct ggml_tensor * q_cur,
4440
+ struct ggml_tensor * kq_mask,
4441
+ int64_t n_ctx,
4442
+ int32_t n_tokens,
4443
+ int32_t kv_head,
4444
+ int32_t n_kv,
4445
+ float max_alibi_bias,
4446
+ float kq_scale,
4447
+ const llm_build_cb & cb,
4448
+ int il) {
4449
+
4450
+ // these nodes are added to the graph together so that they are not reordered
4451
+ // by doing so, the number of splits in the graph is reduced
4452
+ ggml_build_forward_expand(graph, q_cur);
4453
+ ggml_build_forward_expand(graph, k_cur);
4454
+ ggml_build_forward_expand(graph, v_cur);
4455
+
4456
+ llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
4457
+
4458
+ struct ggml_tensor * cur;
4459
+ cur = llm_build_kqv(ctx, model, hparams, kv, graph,
4460
+ wo, wo_b,
4461
+ q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
4462
+ cb(cur, "kqv_out", il);
4463
+
4464
+ return cur;
4465
+ }
4466
+
4467
  struct llm_build_context {
4468
  const llama_model & model;
4469
+ const llama_context & lctx;
4470
  const llama_hparams & hparams;
4471
  const llama_cparams & cparams;
4472
  const llama_batch & batch;
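Note: the new llm_build_kv helper folds the sequence every architecture used to repeat by hand (expand Q/K/V into the graph, append K/V to the cache with llm_build_kv_store, then call llm_build_kqv) into a single call, which is why the per-architecture builders later in the diff shrink to one llm_build_kv line. As a refresher on what that fused step computes, here is a tiny scalar sketch of cached single-head attention in plain C++ (no ggml, no masking or ALiBi, purely illustrative):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// One attention head with a growing KV cache and a single query vector:
// append k,v for the new token, then attend over everything cached so far.
struct kv_cache {
    std::vector<std::vector<float>> K, V;   // one entry per cached position
};

static std::vector<float> attend(kv_cache & kv,
                                 const std::vector<float> & q,
                                 const std::vector<float> & k_cur,
                                 const std::vector<float> & v_cur) {
    kv.K.push_back(k_cur);                  // llm_build_kv_store analogue
    kv.V.push_back(v_cur);

    const size_t n_kv  = kv.K.size();
    const size_t d     = q.size();
    const float  scale = 1.0f / std::sqrt((float) d);

    // scores = softmax(q . K^T * scale)
    std::vector<float> scores(n_kv);
    float max_s = -1e30f;
    for (size_t i = 0; i < n_kv; ++i) {
        float s = 0.0f;
        for (size_t j = 0; j < d; ++j) s += q[j] * kv.K[i][j];
        scores[i] = s * scale;
        max_s = std::max(max_s, scores[i]);
    }
    float sum = 0.0f;
    for (float & s : scores) { s = std::exp(s - max_s); sum += s; }
    for (float & s : scores) { s /= sum; }

    // out = scores . V
    std::vector<float> out(d, 0.0f);
    for (size_t i = 0; i < n_kv; ++i)
        for (size_t j = 0; j < d; ++j)
            out[j] += scores[i] * kv.V[i][j];
    return out;
}

int main() {
    kv_cache kv;
    std::vector<float> q = {1, 0}, k = {1, 0}, v = {0.5f, 0.5f};
    auto out = attend(kv, q, k, v);
    printf("%.3f %.3f\n", out[0], out[1]);  // 0.500 0.500 (single cached position)
}
```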
 
4513
  const llm_build_cb & cb,
4514
  bool worst_case) :
4515
  model (lctx.model),
4516
+ lctx (lctx),
4517
  hparams (model.hparams),
4518
  cparams (lctx.cparams),
4519
  batch (batch),
 
4574
  struct ggml_tensor * cur;
4575
  struct ggml_tensor * inpL;
4576
 
4577
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
4578
  cb(inpL, "inp_embd", -1);
4579
 
4580
  // inp_pos - contains the positions
4581
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
4582
  cb(inp_pos, "inp_pos", -1);
4583
 
4584
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4585
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
4586
  cb(KQ_mask, "KQ_mask", -1);
4587
 
4588
  // shift the entire K-cache if needed
4589
  if (do_rope_shift) {
4590
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4591
  }
4592
 
4593
  for (int il = 0; il < n_layer; ++il) {
 
4623
  cb(Vcur, "Vcur", il);
4624
  }
4625
 
 
 
 
 
 
 
4626
  Qcur = ggml_rope_custom(
4627
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
4628
  hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
 
4637
  );
4638
  cb(Kcur, "Kcur", il);
4639
 
4640
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 
 
4641
  model.layers[il].wo, model.layers[il].bo,
4642
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4643
  cb(cur, "kqv_out", il);
4644
  }
4645
 
 
4758
  struct ggml_tensor * cur;
4759
  struct ggml_tensor * inpL;
4760
 
4761
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
4762
  cb(inpL, "inp_embd", -1);
4763
 
4764
  // inp_pos - contains the positions
4765
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
4766
  cb(inp_pos, "inp_pos", -1);
4767
 
4768
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4769
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
4770
  cb(KQ_mask, "KQ_mask", -1);
4771
 
4772
  // shift the entire K-cache if needed
4773
  if (do_rope_shift) {
4774
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4775
  }
4776
 
4777
  for (int il = 0; il < n_layer; ++il) {
 
4816
  cb(Qcur, "Qcur", il);
4817
  cb(Kcur, "Kcur", il);
4818
 
 
4819
 
4820
  // apply ALiBi for 13B model
4821
  const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
4822
 
4823
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
4824
  model.layers[il].wo, NULL,
4825
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4826
  cb(cur, "kqv_out", il);
4827
  }
4828
 
 
4879
  struct ggml_tensor * cur;
4880
  struct ggml_tensor * inpL;
4881
 
4882
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
4883
  cb(inpL, "inp_embd", -1);
4884
 
4885
  // inp_pos - contains the positions
4886
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
4887
  cb(inp_pos, "inp_pos", -1);
4888
 
4889
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4890
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
4891
  cb(KQ_mask, "KQ_mask", -1);
4892
 
4893
  // shift the entire K-cache if needed
4894
  if (do_rope_shift) {
4895
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
4896
  }
4897
 
4898
  for (int il = 0; il < n_layer; ++il) {
 
4944
  );
4945
  cb(Kcur, "Kcur", il);
4946
 
4947
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 
 
4948
  model.layers[il].wo, NULL,
4949
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4950
  cb(cur, "kqv_out", il);
4951
  }
4952
 
 
5001
  struct ggml_tensor * pos;
5002
  struct ggml_tensor * inpL;
5003
 
5004
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5005
  cb(inpL, "inp_embd", -1);
5006
 
5007
  // inp_pos - contains the positions
5008
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5009
  cb(inp_pos, "inp_pos", -1);
5010
 
5011
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5012
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5013
  cb(KQ_mask, "KQ_mask", -1);
5014
 
5015
  pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
 
5043
 
5044
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5045
 
5046
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 
 
5047
  model.layers[il].wo, model.layers[il].bo,
5048
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5049
  cb(cur, "kqv_out", il);
5050
  }
5051
 
 
5098
  struct ggml_tensor * cur;
5099
  struct ggml_tensor * inpL;
5100
 
5101
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5102
  cb(inpL, "inp_embd", -1);
5103
 
5104
  // inp_pos - contains the positions
5105
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5106
  cb(inp_pos, "inp_pos", -1);
5107
 
5108
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5109
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5110
  cb(KQ_mask, "KQ_mask", -1);
5111
 
5112
  if (do_rope_shift) {
5113
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5114
  }
5115
 
5116
  for (int il = 0; il < n_layer; ++il) {
 
5248
  );
5249
  cb(Vcur, "Vcur", il);
5250
 
5251
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 
 
 
5252
  model.layers[il].wo, model.layers[il].bo,
5253
+ Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5254
  cb(cur, "kqv_out", il);
5255
  }
5256
 
 
5305
  struct ggml_tensor * cur;
5306
  struct ggml_tensor * inpL;
5307
 
5308
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5309
  cb(inpL, "inp_embd", -1);
5310
 
5311
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5312
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5313
  cb(KQ_mask, "KQ_mask", -1);
5314
 
5315
  for (int il = 0; il < n_layer; ++il) {
 
5337
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5338
  cb(Qcur, "Qcur", il);
5339
 
5340
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 
 
5341
  model.layers[il].wo, NULL,
5342
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5343
  cb(cur, "kqv_out", il);
5344
  }
5345
 
 
5395
  struct ggml_tensor * cur;
5396
  struct ggml_tensor * inpL;
5397
 
5398
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5399
  cb(inpL, "inp_embd", -1);
5400
 
5401
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5402
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5403
  cb(KQ_mask, "KQ_mask", -1);
5404
 
5405
  inpL = llm_build_norm(ctx0, inpL, hparams,
 
5433
 
5434
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5435
 
5436
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 
 
5437
  model.layers[il].wo, model.layers[il].bo,
5438
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5439
  cb(cur, "kqv_out", il);
5440
  }
5441
 
 
5488
  struct ggml_tensor * cur;
5489
  struct ggml_tensor * inpL;
5490
 
5491
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5492
  cb(inpL, "inp_embd", -1);
5493
 
5494
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5495
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5496
  cb(KQ_mask, "KQ_mask", -1);
5497
 
5498
  for (int il = 0; il < n_layer; ++il) {
 
5526
 
5527
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5528
 
5529
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 
 
5530
  model.layers[il].wo, NULL,
5531
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5532
  cb(cur, "kqv_out", il);
5533
  }
5534
 
 
5584
  struct ggml_tensor * cur;
5585
  struct ggml_tensor * inpL;
5586
 
5587
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5588
  cb(inpL, "inp_embd", -1);
5589
 
5590
  // inp_pos - contains the positions
5591
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5592
  cb(inp_pos, "inp_pos", -1);
5593
 
5594
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5595
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5596
  cb(KQ_mask, "KQ_mask", -1);
5597
 
5598
  // shift the entire K-cache if needed
5599
  if (do_rope_shift) {
5600
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5601
  }
5602
 
5603
  for (int il = 0; il < n_layer; ++il) {
 
5615
  // compute Q and K and RoPE them
5616
  struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
5617
  cb(Qcur, "Qcur", il);
5618
+ if (model.layers[il].bq) {
5619
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
5620
+ cb(Qcur, "Qcur", il);
5621
+ }
5622
 
5623
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
5624
  cb(Kcur, "Kcur", il);
5625
+ if (model.layers[il].bk) {
5626
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
5627
+ cb(Kcur, "Kcur", il);
5628
+ }
5629
 
5630
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
5631
  cb(Vcur, "Vcur", il);
5632
+ if (model.layers[il].bv) {
5633
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
5634
+ cb(Vcur, "Vcur", il);
5635
+ }
5636
 
5637
  Qcur = ggml_rope_custom(
5638
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
 
5648
  );
5649
  cb(Kcur, "Kcur", il);
5650
 
5651
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 
 
5652
  model.layers[il].wo, NULL,
5653
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5654
  cb(cur, "kqv_out", il);
5655
  }
5656
 
 
5707
  struct ggml_tensor * cur;
5708
  struct ggml_tensor * inpL;
5709
 
5710
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5711
  cb(inpL, "inp_embd", -1);
5712
 
5713
  // inp_pos - contains the positions
5714
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5715
  cb(inp_pos, "inp_pos", -1);
5716
 
5717
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5718
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5719
  cb(KQ_mask, "KQ_mask", -1);
5720
 
5721
  // shift the entire K-cache if needed
5722
  if (do_rope_shift) {
5723
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5724
  }
5725
 
5726
  for (int il = 0; il < n_layer; ++il) {
 
5763
  );
5764
  cb(Kcur, "Kcur", il);
5765
 
5766
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 
 
5767
  model.layers[il].wo, NULL,
5768
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5769
  cb(cur, "kqv_out", il);
5770
  }
5771
 
 
5810
 
5811
  return gf;
5812
  }
5813
+
5814
+ struct ggml_cgraph * build_qwen2() {
5815
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5816
+
5817
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5818
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5819
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
5820
+
5821
+ struct ggml_tensor * cur;
5822
+ struct ggml_tensor * inpL;
5823
+
5824
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5825
+ cb(inpL, "inp_embd", -1);
5826
+
5827
+ // inp_pos - contains the positions
5828
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5829
+ cb(inp_pos, "inp_pos", -1);
5830
+
5831
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5832
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5833
+ cb(KQ_mask, "KQ_mask", -1);
5834
+
5835
+ // shift the entire K-cache if needed
5836
+ if (do_rope_shift) {
5837
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5838
+ }
5839
+
5840
+ for (int il = 0; il < n_layer; ++il) {
5841
+ struct ggml_tensor * inpSA = inpL;
5842
+
5843
+ // norm
5844
+ cur = llm_build_norm(ctx0, inpL, hparams,
5845
+ model.layers[il].attn_norm, NULL,
5846
+ LLM_NORM_RMS, cb, il);
5847
+ cb(cur, "attn_norm", il);
5848
+
5849
+ // self-attention
5850
+ {
5851
+ // compute Q and K and RoPE them
5852
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
5853
+ cb(Qcur, "Qcur", il);
5854
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
5855
+ cb(Qcur, "Qcur", il);
5856
+
5857
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
5858
+ cb(Kcur, "Kcur", il);
5859
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
5860
+ cb(Kcur, "Kcur", il);
5861
+
5862
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
5863
+ cb(Vcur, "Vcur", il);
5864
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
5865
+ cb(Vcur, "Vcur", il);
5866
+
5867
+ // these nodes are added to the graph together so that they are not reordered
5868
+ // by doing so, the number of splits in the graph is reduced
5869
+ ggml_build_forward_expand(gf, Qcur);
5870
+ ggml_build_forward_expand(gf, Kcur);
5871
+ ggml_build_forward_expand(gf, Vcur);
5872
+
5873
+ Qcur = ggml_rope_custom(
5874
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5875
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
5876
+ ext_factor, attn_factor, beta_fast, beta_slow
5877
+ );
5878
+ cb(Qcur, "Qcur", il);
5879
+
5880
+ Kcur = ggml_rope_custom(
5881
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5882
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
5883
+ ext_factor, attn_factor, beta_fast, beta_slow
5884
+ );
5885
+ cb(Kcur, "Kcur", il);
5886
+
5887
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5888
+ model.layers[il].wo, model.layers[il].bo,
5889
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5890
+ cb(cur, "kqv_out", il);
5891
+ }
5892
+
5893
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5894
+ cb(ffn_inp, "ffn_inp", il);
5895
+
5896
+ // feed-forward network
5897
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
5898
+ model.layers[il].ffn_norm, NULL,
5899
+ LLM_NORM_RMS, cb, il);
5900
+ cb(cur, "ffn_norm", il);
5901
+
5902
+ cur = llm_build_ffn(ctx0, cur,
5903
+ model.layers[il].ffn_up, NULL,
5904
+ model.layers[il].ffn_gate, NULL,
5905
+ model.layers[il].ffn_down, NULL,
5906
+ NULL,
5907
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5908
+ cb(cur, "ffn_out", il);
5909
+
5910
+ cur = ggml_add(ctx0, cur, ffn_inp);
5911
+ cb(cur, "l_out", il);
5912
+
5913
+ // input for next layer
5914
+ inpL = cur;
5915
+ }
5916
+
5917
+ cur = inpL;
5918
+
5919
+ cur = llm_build_norm(ctx0, cur, hparams,
5920
+ model.output_norm, NULL,
5921
+ LLM_NORM_RMS, cb, -1);
5922
+ cb(cur, "result_norm", -1);
5923
+
5924
+ // lm_head
5925
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5926
+ cb(cur, "result_output", -1);
5927
+
5928
+ ggml_build_forward_expand(gf, cur);
5929
+
5930
+ return gf;
5931
+ }
5932
+
5933
  struct ggml_cgraph * build_phi2() {
5934
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5935
 
 
5942
  struct ggml_tensor * ffn_output;
5943
  struct ggml_tensor * inpL;
5944
 
5945
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5946
  cb(inpL, "inp_embd", -1);
5947
 
5948
  // inp_pos - contains the positions
5949
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5950
  cb(inp_pos, "inp_pos", -1);
5951
 
5952
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5953
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5954
  cb(KQ_mask, "KQ_mask", -1);
5955
 
5956
  // shift the entire K-cache if needed
5957
  if (do_rope_shift) {
5958
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5959
  }
5960
 
5961
  for (int il = 0; il < n_layer; ++il) {
 
6011
  );
6012
  cb(Kcur, "Kcur", il);
6013
 
6014
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 
 
6015
  model.layers[il].wo, model.layers[il].bo,
6016
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
6017
  cb(cur, "kqv_out", il);
6018
  }
6019
 
 
6064
  struct ggml_tensor * cur;
6065
  struct ggml_tensor * inpL;
6066
 
6067
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6068
  cb(inpL, "inp_embd", -1);
6069
 
6070
  // inp_pos - contains the positions
6071
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6072
  cb(inp_pos, "inp_pos", -1);
6073
 
6074
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6075
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6076
  cb(KQ_mask, "KQ_mask", -1);
6077
 
6078
  // shift the entire K-cache if needed
6079
  if (do_rope_shift) {
6080
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6081
  }
6082
 
6083
  for (int il = 0; il < n_layer; ++il) {
 
6114
  ext_factor, attn_factor, beta_fast, beta_slow);
6115
  cb(Kcur, "Kcur", il);
6116
 
6117
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 
 
6118
  model.layers[il].wo, NULL,
6119
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6120
  cb(cur, "kqv_out", il);
6121
  }
6122
  struct ggml_tensor * sa_out = cur;
 
6171
  struct ggml_tensor * pos;
6172
  struct ggml_tensor * inpL;
6173
 
6174
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6175
  cb(inpL, "inp_embd", -1);
6176
 
6177
  // inp_pos - contains the positions
6178
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6179
  cb(inp_pos, "inp_pos", -1);
6180
 
6181
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6182
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6183
  cb(KQ_mask, "KQ_mask", -1);
6184
 
6185
  pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
 
6213
 
6214
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6215
 
6216
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6217
+ model.layers[il].wo, model.layers[il].bo,
6218
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6219
+ cb(cur, "kqv_out", il);
6220
+ }
6221
+
6222
+ // add the input
6223
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6224
+ cb(ffn_inp, "ffn_inp", il);
6225
+
6226
+ // FF
6227
+ {
6228
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6229
+ model.layers[il].ffn_norm,
6230
+ model.layers[il].ffn_norm_b,
6231
+ LLM_NORM, cb, il);
6232
+ cb(cur, "ffn_norm", il);
6233
+
6234
+ cur = llm_build_ffn(ctx0, cur,
6235
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
6236
+ NULL, NULL,
6237
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
6238
+ NULL,
6239
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
6240
+ cb(cur, "ffn_out", il);
6241
+ }
6242
+
6243
+ inpL = ggml_add(ctx0, cur, ffn_inp);
6244
+ cb(inpL, "l_out", il);
6245
+ }
6246
+
6247
+ cur = llm_build_norm(ctx0, inpL, hparams,
6248
+ model.output_norm,
6249
+ model.output_norm_b,
6250
+ LLM_NORM, cb, -1);
6251
+ cb(cur, "result_norm", -1);
6252
+
6253
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6254
+ cb(cur, "result_output", -1);
6255
+
6256
+ ggml_build_forward_expand(gf, cur);
6257
+
6258
+ return gf;
6259
+ }
6260
+
6261
+ struct ggml_cgraph * build_codeshell() {
6262
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6263
+
6264
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6265
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6266
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6267
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6268
+
6269
+ struct ggml_tensor * cur;
6270
+ struct ggml_tensor * inpL;
6271
+
6272
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6273
+ cb(inpL, "inp_embd", -1);
6274
+
6275
+ // inp_pos - contains the positions
6276
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6277
+ cb(inp_pos, "inp_pos", -1);
6278
+
6279
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6280
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6281
+ cb(KQ_mask, "KQ_mask", -1);
6282
+
6283
+ // shift the entire K-cache if needed
6284
+ if (do_rope_shift) {
6285
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6286
+ }
6287
+
6288
+ for (int il = 0; il < n_layer; ++il) {
6289
+ cur = llm_build_norm(ctx0, inpL, hparams,
6290
+ model.layers[il].attn_norm,
6291
+ model.layers[il].attn_norm_b,
6292
+ LLM_NORM, cb, il);
6293
+ cb(cur, "attn_norm", il);
6294
+
6295
+ // self-attention
6296
+ {
6297
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
6298
+ cb(cur, "wqkv", il);
6299
+
6300
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6301
+ cb(cur, "bqkv", il);
6302
+
6303
+ struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6304
+ struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6305
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6306
+
6307
+ cb(tmpq, "tmpq", il);
6308
+ cb(tmpk, "tmpk", il);
6309
+ cb(Vcur, "Vcur", il);
6310
+
6311
+ struct ggml_tensor * Qcur = ggml_rope_custom(
6312
+ ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
6313
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6314
+ ext_factor, attn_factor, beta_fast, beta_slow
6315
+ );
6316
+ cb(Qcur, "Qcur", il);
6317
+
6318
+ struct ggml_tensor * Kcur = ggml_rope_custom(
6319
+ ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
6320
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6321
+ ext_factor, attn_factor, beta_fast, beta_slow
6322
+ );
6323
+ cb(Kcur, "Kcur", il);
6324
 
6325
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6326
  model.layers[il].wo, model.layers[il].bo,
6327
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6328
  cb(cur, "kqv_out", il);
6329
  }
6330
 
 
6376
  // check if we should build the worst-case graph (for memory measurement)
6377
  const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
6378
 
 
 
 
 
 
 
 
6379
  // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
 
6380
  llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
6381
  if (il >= 0) {
6382
  ggml_format_name(cur, "%s-%d", name, il);
 
6384
  ggml_set_name(cur, name);
6385
  }
6386
 
6387
+ if (!lctx.cparams.offload_kqv) {
6388
+ if (strcmp(name, "kqv_merged_cont") == 0) {
6389
+ // all nodes between the KV store and the attention output are run on the CPU
6390
+ ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
 
 
 
 
 
 
 
6391
  }
 
 
6392
  }
6393
+ };
6394
+
6395
+ struct ggml_cgraph * result = NULL;
6396
 
6397
+ struct llm_build_context llm(lctx, batch, cb, worst_case);
 
6398
 
6399
+ //
6400
+ // set input data
6401
+ //
6402
 
6403
+ if (!ggml_tallocr_is_measure(lctx.alloc)) {
6404
+ if (batch.token) {
6405
+ const int64_t n_tokens = batch.n_tokens;
6406
 
6407
+ ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
6408
  }
6409
 
6410
+ if (batch.embd) {
6411
+ const int64_t n_embd = llm.n_embd;
6412
+ const int64_t n_tokens = batch.n_tokens;
6413
 
6414
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
6415
+ }
6416
 
6417
+ if (batch.pos) {
6418
+ const int64_t n_tokens = batch.n_tokens;
 
6419
 
6420
+ ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
6421
  }
6422
 
6423
+ {
6424
+ const int64_t n_kv = llm.n_kv;
6425
+ const int64_t n_tokens = batch.n_tokens;
6426
 
6427
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
6428
+ float * data = (float *) lctx.inp_KQ_mask->data;
 
6429
 
6430
+ for (int h = 0; h < 1; ++h) {
6431
+ for (int j = 0; j < n_tokens; ++j) {
6432
+ const llama_pos pos = batch.pos[j];
6433
+ const llama_seq_id seq_id = batch.seq_id[j][0];
 
 
 
6434
 
6435
+ for (int i = 0; i < n_kv; ++i) {
6436
+ float f;
6437
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
6438
+ f = -INFINITY;
6439
+ } else {
6440
+ f = 0;
 
 
 
 
 
 
 
6441
  }
6442
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
6443
  }
6444
  }
 
 
 
 
6445
  }
 
 
6446
  }
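For orientation, here is a minimal standalone sketch of the masking rule implemented by the loop above: a KV cell is visible to a batch token only if it belongs to the same sequence and is not at a later position; every other entry is set to -INFINITY so it vanishes in the softmax. The struct and function names below are illustrative stand-ins, not llama.cpp types.

#include <cmath>
#include <vector>

// illustrative stand-in for a KV cell (not the llama.cpp struct)
struct toy_kv_cell {
    int pos;
    int seq_id;
};

// fill a row-major [n_tokens x n_kv] mask the same way as the loop above:
// 0.0f where attention is allowed, -INFINITY where it is masked out
static std::vector<float> build_toy_kq_mask(
        const std::vector<toy_kv_cell> & cells,   // current KV cache cells
        const std::vector<int>         & tok_pos, // position of each batch token
        const std::vector<int>         & tok_seq) // sequence id of each batch token
{
    const size_t n_kv     = cells.size();
    const size_t n_tokens = tok_pos.size();

    std::vector<float> mask(n_tokens*n_kv, 0.0f);
    for (size_t j = 0; j < n_tokens; ++j) {
        for (size_t i = 0; i < n_kv; ++i) {
            const bool other_seq = cells[i].seq_id != tok_seq[j];
            const bool in_future = cells[i].pos    >  tok_pos[j];
            mask[j*n_kv + i] = (other_seq || in_future) ? -INFINITY : 0.0f;
        }
    }
    return mask; // one head's mask; the graph broadcasts it to all heads
}

For example, with four cells at positions 0..3 in one sequence and a token at position 2 of the same sequence, the token's row comes out as {0, 0, 0, -INFINITY}.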
6447
 
6448
+ if (llm.do_rope_shift) {
6449
+ const int64_t n_ctx = llm.n_ctx;
 
 
 
6450
 
6451
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
6452
+ int32_t * data = (int32_t *) lctx.inp_K_shift->data;
 
 
 
 
 
6453
 
6454
+ for (int i = 0; i < n_ctx; ++i) {
6455
+ data[i] = lctx.kv_self.cells[i].delta;
 
 
 
 
 
6456
  }
 
 
6457
  }
6458
+ }
 
 
 
 
6459
 
6460
  llm.init();
6461
 
 
6500
  {
6501
  result = llm.build_qwen();
6502
  } break;
6503
+ case LLM_ARCH_QWEN2:
6504
+ {
6505
+ result = llm.build_qwen2();
6506
+ } break;
6507
  case LLM_ARCH_PHI2:
6508
  {
6509
  result = llm.build_phi2();
 
6516
  {
6517
  result = llm.build_gpt2();
6518
  } break;
6519
+ case LLM_ARCH_CODESHELL:
6520
+ {
6521
+ result = llm.build_codeshell();
6522
+ } break;
6523
  default:
6524
  GGML_ASSERT(false);
6525
  }
 
7956
  auto comp = [](const llama_token_data & a, const llama_token_data & b) {
7957
  return a.logit > b.logit;
7958
  };
7959
+ if (k <= 128) {
 
 
7960
  std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
7961
+ } else {
7962
+ constexpr int nbuckets = 128;
7963
+ constexpr float bucket_low = -10.0f;
7964
+ constexpr float bucket_high = 10.0f;
7965
+ constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
7966
+ constexpr float bucket_inter = -bucket_low * bucket_scale;
7967
+
7968
+ std::vector<int> bucket_idx(candidates->size);
7969
+ std::vector<int> histo(nbuckets, 0);
7970
+
7971
+ for (int i = 0; i < (int)candidates->size; ++i) {
7972
+ const float val = candidates->data[i].logit;
7973
+ int ib = int(bucket_scale * val + bucket_inter); // nbuckets * (val - bucket_low) / (bucket_high - bucket_low)
7974
+ ib = std::max(0, std::min(nbuckets-1, ib));
7975
+ bucket_idx[i] = ib;
7976
+ ++histo[ib];
7977
+ }
7978
+ int nhave = 0;
7979
+ int ib = nbuckets - 1;
7980
+ for ( ; ib >= 0; --ib) {
7981
+ nhave += histo[ib];
7982
+ if (nhave >= k) break;
7983
+ }
7984
+ std::vector<llama_token_data> tmp_tokens(nhave);
7985
+ auto ptr = tmp_tokens.data();
7986
+ std::vector<llama_token_data*> bucket_ptrs;
7987
+ bucket_ptrs.reserve(nbuckets - ib);
7988
+ for (int j = nbuckets - 1; j >= ib; --j) {
7989
+ bucket_ptrs.push_back(ptr);
7990
+ ptr += histo[j];
7991
+ }
7992
+ for (int i = 0; i < (int)candidates->size; ++i) {
7993
+ int j = bucket_idx[i];
7994
+ if (j >= ib) {
7995
+ *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
7996
+ }
7997
+ }
7998
+
7999
+ ptr = tmp_tokens.data();
8000
+ int ndone = 0;
8001
+ for (int j = nbuckets-1; j > ib; --j) {
8002
+ std::sort(ptr, ptr + histo[j], comp);
8003
+ ptr += histo[j];
8004
+ ndone += histo[j];
8005
+ }
8006
+ std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
8007
+
8008
+ std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
8009
+
8010
  }
8011
  candidates->sorted = true;
8012
  }
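For readers skimming the new top-k path: for large k it is cheaper to histogram the logits into coarse buckets, keep only the buckets that can hold the top k entries, and sort just those. Below is a simplified, self-contained sketch of that idea — not the llama.cpp implementation, which reuses the candidate structs and only partial-sorts the boundary bucket.

#include <algorithm>
#include <functional>
#include <vector>

// return the k largest values in descending order, sorting only the top
// buckets of a coarse histogram instead of the whole array
static std::vector<float> toy_top_k(const std::vector<float> & vals, int k) {
    constexpr int   nbuckets = 128;
    constexpr float lo = -10.0f, hi = 10.0f;
    constexpr float scale = nbuckets/(hi - lo);

    std::vector<std::vector<float>> buckets(nbuckets);
    for (float v : vals) {
        const int ib = std::max(0, std::min(nbuckets - 1, int(scale*(v - lo))));
        buckets[ib].push_back(v);
    }

    // walk the buckets from the highest values down until k items are collected
    std::vector<float> out;
    for (int ib = nbuckets - 1; ib >= 0 && (int) out.size() < k; --ib) {
        std::sort(buckets[ib].begin(), buckets[ib].end(), std::greater<float>());
        out.insert(out.end(), buckets[ib].begin(), buckets[ib].end());
    }
    if ((int) out.size() > k) {
        out.resize(k);
    }
    return out;
}

Only the buckets near the top are ever sorted, which is what makes the k > 128 case cheaper than a partial sort over the entire candidate list.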
 
8198
  }
8199
  }
8200
 
8201
+ void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
8202
+ const int64_t t_start_sample_us = ggml_time_us();
8203
+
8204
+ // nothing to do if there is at most one candidate
8205
+ if (candidates_p->size <= 1) {
8206
+ return;
8207
+ }
8208
+
8209
+ // Calculate maximum possible entropy
8210
+ float max_entropy = -logf(1.0f / candidates_p->size);
8211
+
8212
+ llama_sample_softmax(nullptr, candidates_p);
8213
+
8214
+ // Calculate entropy of the softmax probabilities
8215
+ float entropy = 0.0f;
8216
+ for (size_t i = 0; i < candidates_p->size; ++i) {
8217
+ float prob = candidates_p->data[i].p;
8218
+ if (prob > 0.0f) { // Ensure no log(0)
8219
+ entropy -= prob * logf(prob);
8220
+ }
8221
+ }
8222
+
8223
+ // Normalize the entropy (max_entropy cannot be 0 here because we returned early when candidates_p->size <= 1 above)
8224
+ float normalized_entropy = entropy / max_entropy;
8225
+
8226
+ // Map the normalized entropy to the desired temperature range using the power function
8227
+ float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
8228
+
8229
+ #ifdef DEBUG
8230
+ LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
8231
+ LLAMA_LOG_INFO("Entropy: %f\n", entropy);
8232
+ LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
8233
+ LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
8234
+ LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
8235
+ LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
8236
+ #endif
8237
+
8238
+ // Apply the dynamically calculated temperature scaling
8239
+ for (size_t i = 0; i < candidates_p->size; ++i) {
8240
+ candidates_p->data[i].logit /= dyn_temp;
8241
+ }
8242
+
8243
+ // Re-compute softmax probabilities after scaling logits with dynamic temperature
8244
+ double max_l_double = candidates_p->data[0].logit;
8245
+ double cum_sum_double = 0.0;
8246
+ for (size_t i = 0; i < candidates_p->size; ++i) {
8247
+ double p = exp(candidates_p->data[i].logit - max_l_double);
8248
+ candidates_p->data[i].p = p; // Store the scaled probability
8249
+ cum_sum_double += p;
8250
+ }
8251
+ for (size_t i = 0; i < candidates_p->size; ++i) {
8252
+ candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
8253
+ }
8254
+
8255
+ #ifdef DEBUG
8256
+ // Print the updated top 25 probabilities after temperature scaling
8257
+ LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
8258
+ for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
8259
+ LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
8260
+ }
8261
+ #endif
8262
+
8263
+ if (ctx) {
8264
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
8265
+ }
8266
+ }
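The mapping applied above collapses to a single formula, dyn_temp = min_temp + (max_temp - min_temp) * (entropy/max_entropy)^exponent_val. A minimal sketch with a worked number follows; the parameter values are chosen only for illustration.

#include <cmath>

// entropy-to-temperature mapping used by llama_sample_entropy above
static float toy_dyn_temp(float entropy, float max_entropy,
                          float min_temp, float max_temp, float exponent) {
    const float normalized = entropy / max_entropy;
    return min_temp + (max_temp - min_temp) * powf(normalized, exponent);
}

// example: four equally likely candidates give entropy == max_entropy == ln(4),
// so toy_dyn_temp(logf(4.0f), logf(4.0f), 0.5f, 1.5f, 1.0f) == 1.5f — the
// flattest distribution gets the highest temperature, while a near-one-hot
// distribution pushes the result toward min_temp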
8267
+
8268
  void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
8269
  const int64_t t_start_sample_us = ggml_time_us();
8270
 
 
8853
  const llama_model_quantize_params * params;
8854
 
8855
  int n_attention_wv = 0;
8856
+ int n_ffn_down = 0;
8857
+ int n_ffn_gate = 0;
8858
+ int n_ffn_up = 0;
8859
  int i_attention_wv = 0;
8860
+ int i_ffn_down = 0;
8861
+ int i_ffn_gate = 0;
8862
+ int i_ffn_up = 0;
8863
 
8864
  int n_k_quantized = 0;
8865
  int n_fallback = 0;
 
8943
  auto use_more_bits = [](int i_layer, int num_layers) -> bool {
8944
  return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
8945
  };
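As a quick sanity check of the pattern selected by use_more_bits above, the snippet below enumerates the chosen layers for an illustrative num_layers = 32.

#include <cstdio>

// prints the layers for which use_more_bits(i, 32) is true:
// 0 1 2 3 6 9 12 15 18 21 24 27 28 29 30 31
// i.e. the first and last eighth of the layers plus every third layer in between
int main() {
    const int num_layers = 32;
    for (int i = 0; i < num_layers; ++i) {
        const bool more = i < num_layers/8 || i >= 7*num_layers/8 || (i - num_layers/8)%3 == 2;
        if (more) {
            printf("%d ", i);
        }
    }
    printf("\n");
    return 0;
}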
8946
+ const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
8947
+ auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
8948
+ if (n_expert > 1) {
8949
+ // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
8950
+ // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
8951
+ // for getting the current layer as I initially thought, and we need to resort to parsing the
8952
+ // tensor name.
8953
+ n_layer /= n_expert;
8954
+ if (sscanf(name, "blk.%d.", &i_layer) != 1) {
8955
+ throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
8956
+ }
8957
+ if (i_layer < 0 || i_layer >= n_layer) {
8958
+ throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
8959
+ }
8960
+ }
8961
+ return std::make_pair(i_layer, n_layer);
8962
+ };
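The lambda above recovers the layer index by parsing the "blk.%d." prefix of the tensor name rather than by counting tensors. A minimal sketch of that parsing step, using a hypothetical expert tensor name:

#include <cstdio>

int main() {
    // hypothetical expert tensor name, following the "blk.<layer>." convention
    const char * name = "blk.17.ffn_down.3.weight";

    int i_layer = -1;
    if (sscanf(name, "blk.%d.", &i_layer) == 1) {
        printf("layer = %d\n", i_layer); // prints: layer = 17
    }
    return 0;
}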
8963
 
8964
  if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
8965
  int nx = tensor->ne[0];
 
8979
  ++qs.i_attention_wv;
8980
  }
8981
  else if (name.find("ffn_down") != std::string::npos) {
8982
+ if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
8983
+ ++qs.i_ffn_down;
8984
  }
8985
  else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
8986
  } else if (name.find("attn_v.weight") != std::string::npos) {
 
9017
  // TODO: explore better strategies
9018
  new_type = GGML_TYPE_Q8_0;
9019
  }
9020
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
9021
+ new_type = GGML_TYPE_Q2_K;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9022
  }
9023
+ } else if (name.find("ffn_down") != std::string::npos) {
9024
+ auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
9025
+ int i_layer = info.first, n_layer = info.second;
9026
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
9027
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
9028
  if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
9029
  }
9030
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
 
9054
  // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
9055
  new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
9056
  }
9057
+ ++qs.i_ffn_down;
9058
  } else if (name.find("attn_output.weight") != std::string::npos) {
9059
  if (arch != LLM_ARCH_FALCON) {
9060
  if (qs.model.hparams.n_expert == 8) {
9061
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
9062
+ ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
9063
  ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
9064
  new_type = GGML_TYPE_Q5_K;
9065
  }
 
9077
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
9078
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
9079
  }
9080
+ else if (name.find("ffn_gate") != std::string::npos) {
9081
+ auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
9082
+ int i_layer = info.first, n_layer = info.second;
9083
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
9084
+ new_type = GGML_TYPE_Q2_K;
9085
+ }
9086
+ ++qs.i_ffn_gate;
9087
+ }
9088
+ else if (name.find("ffn_up") != std::string::npos) {
9089
+ auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
9090
+ int i_layer = info.first, n_layer = info.second;
9091
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
9092
+ new_type = GGML_TYPE_Q2_K;
9093
+ }
9094
+ ++qs.i_ffn_up;
9095
+ }
9096
+ // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
9097
+ //}
9098
  // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
9099
  //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
9100
  // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
 
9149
  case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
9150
 
9151
  // K-quants
9152
+ case LLAMA_FTYPE_MOSTLY_Q2_K_S:
9153
  case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
9154
+ case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
9155
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
9156
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
9157
  case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
 
9219
  ++qs.n_attention_wv;
9220
  }
9221
  else if (name.find("ffn_down") != std::string::npos) {
9222
+ ++qs.n_ffn_down;
9223
+ }
9224
+ else if (name.find("ffn_gate") != std::string::npos) {
9225
+ ++qs.n_ffn_gate;
9226
+ }
9227
+ else if (name.find("ffn_up") != std::string::npos) {
9228
+ ++qs.n_ffn_up;
9229
  }
9230
  }
9231
+ if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
9232
+ LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
9233
+ __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
9234
  }
9235
 
9236
  size_t total_size_org = 0;
 
10038
  ctx->embedding.resize(hparams.n_embd);
10039
  }
10040
 
10041
+ // graph inputs
10042
+ {
10043
+ ggml_init_params init_params = {
10044
+ /* .mem_size */ ggml_tensor_overhead()*5,
10045
+ /* .mem_buffer */ nullptr,
10046
+ /* .no_alloc */ true,
10047
+ };
10048
+ ctx->ctx_input = ggml_init(init_params);
10049
+
10050
+ ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
10051
+ ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
10052
+ ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
10053
+ ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
10054
+ ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
10055
+
10056
+ ggml_set_name(ctx->inp_tokens, "inp_tokens");
10057
+ ggml_set_name(ctx->inp_embd, "inp_embd");
10058
+ ggml_set_name(ctx->inp_pos, "inp_pos");
10059
+ ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
10060
+ ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
10061
+
10062
+ ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
10063
+
10064
+ LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
10065
+ ggml_backend_buffer_name(ctx->buf_input),
10066
+ ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
10067
+ }
10068
+
10069
+ // scheduler and compute buffers
10070
  {
10071
  // buffer types used for the compute buffer of each backend
10072
  std::vector<ggml_backend_buffer_type_t> backend_buft;
 
10093
 
10094
  // initialize scheduler with the worst-case graph
10095
  ggml_backend_sched_init_measure(ctx->sched, gf);
 
 
 
10096
  ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
10097
 
10098
  for (ggml_backend_t backend : ctx->backends) {
 
10101
  ggml_backend_buffer_name(buf),
10102
  ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
10103
  }
10104
+
10105
+ // note: the number of splits during measure is higher than during inference due to the kv shift
10106
+ int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
10107
+ LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
10108
  }
10109
  }
10110
 
examples/talk-llama/llama.h CHANGED
@@ -107,6 +107,7 @@ extern "C" {
107
  LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
108
  LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
109
  LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
 
110
 
111
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
112
  };
@@ -774,6 +775,14 @@ extern "C" {
774
  float p,
775
  size_t min_keep);
776
 
 
 
 
 
 
 
 
 
777
  LLAMA_API void llama_sample_temp(
778
  struct llama_context * ctx,
779
  llama_token_data_array * candidates,
 
107
  LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
108
  LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
109
  LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
110
+ LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
111
 
112
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
113
  };
 
775
  float p,
776
  size_t min_keep);
777
 
778
+ /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
779
+ LLAMA_API void llama_sample_entropy(
780
+ struct llama_context * ctx,
781
+ llama_token_data_array * candidates_p,
782
+ float min_temp,
783
+ float max_temp,
784
+ float exponent_val);
785
+
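A hedged usage sketch for the new entry point: the candidates array is assumed to be prepared from the current logits, and the temperature range and exponent are illustrative values, not defaults.

#include "llama.h"

// apply entropy-based dynamic temperature to a prepared candidates array
static void apply_dynatemp(struct llama_context * ctx, llama_token_data_array * candidates) {
    llama_sample_entropy(ctx, candidates, /*min_temp=*/0.5f, /*max_temp=*/1.5f, /*exponent_val=*/1.0f);
    // candidates->data[i].logit has been divided by the entropy-dependent
    // temperature and candidates->data[i].p holds re-normalized probabilities
}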
786
  LLAMA_API void llama_sample_temp(
787
  struct llama_context * ctx,
788
  llama_token_data_array * candidates,
examples/talk-llama/unicode.h CHANGED
@@ -2,8 +2,9 @@
2
 
3
  #include <cassert>
4
  #include <stdexcept>
5
- #include <vector>
6
  #include <unordered_map>
 
7
 
8
  static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
9
  {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
 
2
 
3
  #include <cassert>
4
  #include <stdexcept>
5
+ #include <string>
6
  #include <unordered_map>
7
+ #include <vector>
8
 
9
  static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
10
  {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},