ggerganov committed on
Commit c9ddda2 · 1 Parent(s): 43d5a06

talk-llama : sync llama.cpp

examples/talk-llama/llama-vocab.cpp CHANGED
@@ -50,7 +50,7 @@ struct naive_trie {
50
  res.first->second.insert(key + 1, len - 1, value);
51
  }
52
  }
53
- std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) {
54
  if (len == 0 || offset == len) {
55
  return std::make_pair(key, offset);
56
  }
@@ -79,6 +79,15 @@ struct naive_trie {
79
  // impl
80
  //
81
 
82
  int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
83
  GGML_ASSERT(token_left.find(' ') == std::string::npos);
84
  GGML_ASSERT(token_left.find('\n') == std::string::npos);
@@ -187,10 +196,15 @@ struct llm_bigram_spm {
187
  size_t size;
188
  };
189
 
190
- struct llm_tokenizer_spm {
191
- llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
 
 
 
 
192
 
193
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
 
194
  // split string into utf8 chars
195
  int index = 0;
196
  size_t offs = 0;
@@ -271,7 +285,7 @@ private:
271
  return;
272
  }
273
 
274
- resegment(symbols[p->second.first], output);
275
  resegment(symbols[p->second.second], output);
276
  }
277
 
@@ -279,7 +293,6 @@ private:
279
  if (left == -1 || right == -1) {
280
  return;
281
  }
282
-
283
  const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
284
  auto token = vocab.token_to_id.find(text);
285
 
@@ -306,10 +319,11 @@ private:
306
  }
307
 
308
  const llama_vocab & vocab;
 
 
309
 
310
  std::vector<llm_symbol> symbols;
311
  llm_bigram_spm::queue work_queue;
312
-
313
  std::map<std::string, std::pair<int, int>> rev_merge;
314
  };
315
 
@@ -352,8 +366,8 @@ struct llm_bigram_bpe {
352
  size_t size;
353
  };
354
 
355
- struct llm_tokenizer_bpe {
356
- llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
357
  GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
358
  switch (vocab.type_pre) {
359
  case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
@@ -450,6 +464,20 @@ struct llm_tokenizer_bpe {
450
  "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
451
  };
452
  break;
453
  default:
454
  // default regex for BPE tokenization pre-processing
455
  regex_exprs = {
@@ -462,7 +490,14 @@ struct llm_tokenizer_bpe {
462
  }
463
  }
464
 
465
- void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
466
  output.push_back(token_id);
467
  }
468
 
@@ -501,12 +536,11 @@ struct llm_tokenizer_bpe {
501
 
502
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
503
  int final_prev_index = -1;
504
-
505
- const auto word_collection = unicode_regex_split(text, regex_exprs);
506
 
507
  symbols_final.clear();
508
 
509
- for (auto & word : word_collection) {
510
  work_queue = llm_bigram_bpe::queue();
511
  symbols.clear();
512
 
@@ -609,7 +643,6 @@ private:
609
  if (left == -1 || right == -1) {
610
  return;
611
  }
612
-
613
  std::string left_token = std::string(symbols[left].text, symbols[left].n);
614
  std::string right_token = std::string(symbols[right].text, symbols[right].n);
615
 
@@ -633,12 +666,10 @@ private:
633
  }
634
 
635
  const llama_vocab & vocab;
636
-
637
- std::vector<std::string> regex_exprs;
638
 
639
  std::vector<llm_symbol> symbols;
640
  std::vector<llm_symbol> symbols_final;
641
-
642
  llm_bigram_bpe::queue work_queue;
643
  };
644
 
@@ -646,15 +677,17 @@ private:
646
  // WPM tokenizer
647
  //
648
 
649
- struct llm_tokenizer_wpm {
650
- llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
 
651
 
652
- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
653
- const auto & token_map = vocab.token_to_id;
654
 
 
 
655
  // normalize and split by whitespace
656
  std::vector<std::string> words = preprocess(text);
657
-
658
  // bos token prepended already
659
 
660
  // find the longest tokens that form the words
@@ -699,7 +732,7 @@ struct llm_tokenizer_wpm {
699
  }
700
 
701
  // TODO: reduce string copies by using cpts_offs array
702
- std::vector<std::string> preprocess(const std::string & text) const {
703
  const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
704
  std::vector<std::string> words(1, "");
705
 
@@ -751,15 +784,18 @@ struct llm_tokenizer_wpm {
751
  //(cpt >= 0xFF00 && cpt <= 0xFFEF);
752
  }
753
 
 
754
  const llama_vocab & vocab;
 
 
755
  };
756
 
757
  //
758
  // UGM tokenizer
759
  //
760
 
761
- struct llm_tokenizer_ugm {
762
- llm_tokenizer_ugm(const llama_vocab & vocab) : vocab(vocab) {
763
  if (vocab.precompiled_charsmap.size() > 0) {
764
  size_t charsmap_offset = 0;
765
 
@@ -805,6 +841,30 @@ struct llm_tokenizer_ugm {
805
  unknown_token_score = min_score - unknown_token_score_penalty;
806
  }
807
 
808
  /* This implementation is based on SentencePiece optimized Viterbi algorithm for
809
  * unigram language models. The general idea is to:
810
  * - move along the input sequence in steps of one UTF code point,
@@ -843,7 +903,7 @@ struct llm_tokenizer_ugm {
843
  // traverse the token matcher trie to find a matching token
844
  bool single_codepoint_token_found = false;
845
  const struct best_tokenization & current_best = tokenization_results[input_offset];
846
- const struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);
847
 
848
  while (prefix_offset <= input_len && node != NULL) {
849
  // check if we found valid token in prefix
@@ -873,7 +933,7 @@ struct llm_tokenizer_ugm {
873
  // if we didn't find a valid token corresponding to the whole UTF code point
874
  // then use unknown token as the tokenization of this UTF code point
875
  if (!single_codepoint_token_found) {
876
- const double challenger_score = current_best.score_sum + unknown_token_score;
877
  prefix_offset = input_offset + n_utf8_code_units;
878
  struct best_tokenization & current_champ = tokenization_results[prefix_offset];
879
  if (challenger_score > current_champ.score_sum) {
@@ -905,7 +965,6 @@ struct llm_tokenizer_ugm {
905
  }
906
 
907
  private:
908
- const llama_vocab & vocab;
909
 
910
  // helper structure for returning normalization results
911
  struct normalization_result {
@@ -918,7 +977,7 @@ private:
918
  normalized->clear();
919
  normalized->reserve(input.size() * 3);
920
 
921
- const std::string space = vocab.tokenizer_escape_whitespaces ? escaped_space : " ";
922
 
923
  bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
924
  bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
@@ -1000,13 +1059,21 @@ private:
1000
  size_t xcda_array_size;
1001
  };
1002
 
1003
  struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
1004
  if (input_offset == input.size()) {
1005
  return { &input[input_offset], 0, 0 };
1006
  }
1007
 
1008
  // if input prefix matches some user-defined token return this token as normalization result
1009
- auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
 
1010
  if (user_defined_token_match.second > 0) {
1011
  return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
1012
  }
@@ -1014,8 +1081,8 @@ private:
1014
  size_t longest_prefix_length = 0;
1015
  size_t longest_prefix_offset = 0;
1016
 
1017
- if (xcda_array_size > 0) {
1018
- struct xcda_array_view xcda_view(xcda_array, xcda_array_size);
1019
 
1020
  // Find the longest normalized sequence matching the input prefix by walking
1021
  // the XOR-compressed compact double array (XCDA) starting from the root node
@@ -1051,50 +1118,27 @@ private:
1051
 
1052
  if (longest_prefix_length > 0) {
1053
  // we have a match, so return the replacement sequence
1054
- if (longest_prefix_offset >= prefix_replacements_size) {
1055
  throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
1056
  }
1057
- const char * prefix_replacement = &prefix_replacements[longest_prefix_offset];
1058
  return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
1059
- } else {
1060
- // check if the input prefix contains a valid sequence of UTF-8 code units
1061
- try {
1062
- // if yes, return this sequence unmodified
1063
- size_t prefix_offset = input_offset;
1064
- unicode_cpt_from_utf8(input, prefix_offset);
1065
- return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
1066
- } catch (std::invalid_argument & /*ex*/) {
1067
- // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
1068
- return { "\xEF\xBF\xBD", 3, 1 };
1069
- }
1070
  }
1071
- }
1072
-
1073
- // escaped space symbol - U+2581 (Lower One Eighth Block)
1074
- const std::string escaped_space = "\xE2\x96\x81";
1075
 
1076
- const char * prefix_replacements = NULL;
1077
- size_t prefix_replacements_size = 0;
1078
-
1079
- const uint32_t * xcda_array = NULL;
1080
- size_t xcda_array_size = 0;
1081
-
1082
- struct naive_trie user_defined_token_matcher;
1083
-
1084
- // this structure stores the best tokenization so far at input_offset
1085
- struct best_tokenization {
1086
- llama_token token_id;
1087
- size_t input_offset;
1088
- float score_sum;
1089
- };
1090
-
1091
- float min_score = FLT_MAX;
1092
- float max_score = -FLT_MAX;
1093
-
1094
- float unknown_token_score_penalty = 10.0;
1095
- float unknown_token_score;
1096
 
1097
- struct naive_trie token_matcher;
 
1098
  };
1099
 
1100
  //
@@ -1155,8 +1199,8 @@ static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escape
1155
  return output;
1156
  }
1157
 
1158
- struct llm_tokenizer_rwkv {
1159
- llm_tokenizer_rwkv(const llama_vocab & vocab): vocab(vocab) {
1160
  // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
1161
  // For now, we decode the vocab here into the lookup we'll use for tokenization.
1162
 
@@ -1168,11 +1212,17 @@ struct llm_tokenizer_rwkv {
1168
  }
1169
  }
1170
 
1171
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
1172
  uint32_t position = 0;
1173
-
1174
  while (position < text.size()) {
1175
- const struct naive_trie * node = token_matcher.traverse(text[position]);
1176
  if (node == NULL) {
1177
  // no matching token found, add unknown token
1178
  output.push_back(vocab.special_unk_id);
@@ -1197,11 +1247,33 @@ struct llm_tokenizer_rwkv {
1197
  }
1198
  }
1199
 
 
1200
  const llama_vocab & vocab;
1201
-
1202
- struct naive_trie token_matcher;
1203
  };
1204
 
1205
  //
1206
  // (de-) tokenize
1207
  //
@@ -1263,7 +1335,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
1263
 
1264
  // if a fragment is text ( not yet processed )
1265
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
1266
- auto & raw_text = fragment.raw_text;
1267
 
1268
  auto raw_text_base_offset = fragment.offset;
1269
  auto raw_text_base_length = fragment.length;
@@ -1362,7 +1434,13 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
1362
  }
1363
  }
1364
 
1365
- std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
1366
  std::vector<llama_vocab::id> output;
1367
  std::forward_list<fragment_buffer_variant> fragment_buffer;
1368
 
@@ -1399,9 +1477,9 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
1399
  #ifdef PRETOKENIZERDEBUG
1400
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
1401
  #endif
1402
- llm_tokenizer_spm tokenizer(vocab);
1403
  llama_escape_whitespace(raw_text);
1404
- tokenizer.tokenize(raw_text, output);
 
1405
  is_prev_special = false;
1406
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
1407
  output.push_back(fragment.token);
@@ -1423,10 +1501,11 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
1423
  } break;
1424
  case LLAMA_VOCAB_TYPE_BPE:
1425
  {
1426
- llm_tokenizer_bpe tokenizer(vocab);
1427
-
 
1428
  if (add_special) {
1429
- tokenizer.append_bos(output);
1430
  }
1431
  for (const auto & fragment : fragment_buffer) {
1432
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -1435,15 +1514,15 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
1435
  #ifdef PRETOKENIZERDEBUG
1436
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
1437
  #endif
1438
- tokenizer.tokenize(raw_text, output);
1439
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
1440
- tokenizer.append(fragment.token, output);
1441
  }
1442
  }
1443
 
1444
  if (add_special) {
1445
- tokenizer.append_eos(output);
1446
- tokenizer.check_double_bos_eos(output);
1447
  }
1448
  } break;
1449
  case LLAMA_VOCAB_TYPE_WPM:
@@ -1453,7 +1532,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
1453
  output.push_back(vocab.special_cls_id);
1454
  }
1455
 
1456
- llm_tokenizer_wpm tokenizer(vocab);
1457
 
1458
  for (const auto & fragment : fragment_buffer) {
1459
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -1462,7 +1541,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
1462
  #ifdef PRETOKENIZERDEBUG
1463
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
1464
  #endif
1465
- tokenizer.tokenize(raw_text, output);
1466
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
1467
  output.push_back(fragment.token);
1468
  }
@@ -1475,12 +1554,11 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
1475
  } break;
1476
  case LLAMA_VOCAB_TYPE_UGM:
1477
  {
1478
- llm_tokenizer_ugm tokenizer(vocab);
1479
-
1480
- if (add_special && vocab.tokenizer_add_bos != 0) {
1481
  GGML_ASSERT(vocab.special_bos_id != -1);
1482
  output.push_back(vocab.special_bos_id);
1483
  }
 
1484
 
1485
  for (const auto & fragment : fragment_buffer) {
1486
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -1488,26 +1566,27 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
1488
  #ifdef PRETOKENIZERDEBUG
1489
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
1490
  #endif
1491
- tokenizer.tokenize(raw_text, output);
1492
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
1493
  output.push_back(fragment.token);
1494
  }
1495
  }
1496
 
1497
- if (add_special && vocab.tokenizer_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
1498
  LLAMA_LOG_WARN(
1499
  "%s: Added a BOS token to the prompt as specified by the model but the prompt "
1500
  "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
1501
  "Are you sure this is what you want?\n", __FUNCTION__);
1502
  }
1503
 
1504
- if (add_special && vocab.tokenizer_add_eos == 1) {
1505
  GGML_ASSERT(vocab.special_eos_id != -1);
1506
  output.push_back(vocab.special_eos_id);
1507
  }
1508
  } break;
1509
  case LLAMA_VOCAB_TYPE_RWKV:
1510
  {
 
1511
  for (const auto & fragment : fragment_buffer) {
1512
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
1513
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -1516,8 +1595,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
1516
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
1517
  #endif
1518
 
1519
- llm_tokenizer_rwkv tokenizer(vocab);
1520
- tokenizer.tokenize(raw_text, output);
1521
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
1522
  output.push_back(fragment.token);
1523
  }
@@ -1630,13 +1708,13 @@ llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
1630
  }
1631
 
1632
  int32_t llama_tokenize_impl(
1633
- const struct llama_vocab & vocab,
1634
- const char * text,
1635
- int32_t text_len,
1636
- llama_token * tokens,
1637
- int32_t n_tokens_max,
1638
- bool add_special,
1639
- bool parse_special) {
1640
  auto res = llama_tokenize_internal(vocab, std::string(text, text_len), add_special, parse_special);
1641
  if (n_tokens_max < (int) res.size()) {
1642
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -1713,11 +1791,13 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
1713
  // suppressing them like CONTROL tokens.
1714
  if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
1715
  return _try_copy(token_text.data(), token_text.size());
1716
- } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
 
1717
  std::string result = token_text;
1718
  llama_unescape_whitespace(result);
1719
  return _try_copy(result.data(), result.size());
1720
- } else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
 
1721
  char byte = (char) llama_token_to_byte(vocab, token);
1722
  return _try_copy((char*) &byte, 1);
1723
  }
@@ -1728,7 +1808,8 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
1728
  // suppressing them like CONTROL tokens.
1729
  if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
1730
  return _try_copy(token_text.data(), token_text.size());
1731
- } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
 
1732
  std::string result = llama_decode_text(token_text);
1733
  return _try_copy(result.data(), result.size());
1734
  }
@@ -1761,6 +1842,8 @@ int32_t llama_detokenize_impl(
1761
  int32_t text_len_max,
1762
  bool remove_special,
1763
  bool unparse_special) {
 
 
1764
  int32_t avail = text_len_max;
1765
  int32_t total = 0;
1766
 
 
50
  res.first->second.insert(key + 1, len - 1, value);
51
  }
52
  }
53
+ std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
54
  if (len == 0 || offset == len) {
55
  return std::make_pair(key, offset);
56
  }
 
79
  // impl
80
  //
81
 
82
+ struct llm_tokenizer {
83
+ llm_tokenizer() {}
84
+ virtual ~llm_tokenizer() = default;
85
+ };
86
+
87
+ llama_vocab::~llama_vocab() {
88
+ delete tokenizer;
89
+ }
90
+
91
  int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
92
  GGML_ASSERT(token_left.find(' ') == std::string::npos);
93
  GGML_ASSERT(token_left.find('\n') == std::string::npos);
 
196
  size_t size;
197
  };
198
 
199
+ struct llm_tokenizer_spm : llm_tokenizer {
200
+ llm_tokenizer_spm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
201
+ };
202
+
203
+ struct llm_tokenizer_spm_session {
204
+ llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
205
 
206
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
207
+
208
  // split string into utf8 chars
209
  int index = 0;
210
  size_t offs = 0;
 
285
  return;
286
  }
287
 
288
+ resegment(symbols[p->second.first], output);
289
  resegment(symbols[p->second.second], output);
290
  }
291
 
 
293
  if (left == -1 || right == -1) {
294
  return;
295
  }
 
296
  const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
297
  auto token = vocab.token_to_id.find(text);
298
 
 
319
  }
320
 
321
  const llama_vocab & vocab;
322
+ // currently unused
323
+ // const llm_tokenizer_spm * spm_tokenizer;
324
 
325
  std::vector<llm_symbol> symbols;
326
  llm_bigram_spm::queue work_queue;
 
327
  std::map<std::string, std::pair<int, int>> rev_merge;
328
  };
329
 
 
366
  size_t size;
367
  };
368
 
369
+ struct llm_tokenizer_bpe : llm_tokenizer {
370
+ llm_tokenizer_bpe(const llama_vocab & vocab) : llm_tokenizer() {
371
  GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
372
  switch (vocab.type_pre) {
373
  case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
 
464
  "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
465
  };
466
  break;
467
+ case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
468
+ // Note: in theory, the special token (sentinel and image token) regex_exprs below
469
+ // are unnecessary, as they are split in `tokenizer_st_partition` anyway.
470
+ // However, since the upstream pre-tokenizer uses them, they are also
471
+ // included here (see https://huggingface.co/facebook/chameleon-7b).
472
+ regex_exprs = {
473
+ "<sentinel:[0-9]+>", // Sentinel tokens
474
+ "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens
475
+ "([\\t\\n]| | )", // directly from tokenizer.json
476
+ "\\p{N}", // Individual digits
477
+ "[\\p{P}!-/:-@\\[-`{-~]", // Punctuation, Isolated
478
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
479
+ };
480
+ break;
481
  default:
482
  // default regex for BPE tokenization pre-processing
483
  regex_exprs = {
 
490
  }
491
  }
492
 
493
+ std::vector<std::string> regex_exprs;
494
+ };
495
+
496
+ struct llm_tokenizer_bpe_session {
497
+ llm_tokenizer_bpe_session(const llama_vocab & vocab) : vocab(vocab),
498
+ bpe_tokenizer(static_cast<const llm_tokenizer_bpe *>(vocab.tokenizer)) {}
499
+
500
+ static void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) {
501
  output.push_back(token_id);
502
  }
503
 
 
536
 
537
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
538
  int final_prev_index = -1;
539
+ const auto word_collection = unicode_regex_split(text, bpe_tokenizer->regex_exprs);
 
540
 
541
  symbols_final.clear();
542
 
543
+ for (const auto & word : word_collection) {
544
  work_queue = llm_bigram_bpe::queue();
545
  symbols.clear();
546
 
 
643
  if (left == -1 || right == -1) {
644
  return;
645
  }
 
646
  std::string left_token = std::string(symbols[left].text, symbols[left].n);
647
  std::string right_token = std::string(symbols[right].text, symbols[right].n);
648
 
 
666
  }
667
 
668
  const llama_vocab & vocab;
669
+ const llm_tokenizer_bpe * bpe_tokenizer;
 
670
 
671
  std::vector<llm_symbol> symbols;
672
  std::vector<llm_symbol> symbols_final;
 
673
  llm_bigram_bpe::queue work_queue;
674
  };
675
 
 
677
  // WPM tokenizer
678
  //
679
 
680
+ struct llm_tokenizer_wpm : llm_tokenizer {
681
+ llm_tokenizer_wpm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
682
+ };
683
 
684
+ struct llm_tokenizer_wpm_session {
685
+ llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
686
 
687
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
688
+ const auto & token_map = vocab.token_to_id;
689
  // normalize and split by whitespace
690
  std::vector<std::string> words = preprocess(text);
 
691
  // bos token prepended already
692
 
693
  // find the longest tokens that form the words
 
732
  }
733
 
734
  // TODO: reduce string copies by using cpts_offs array
735
+ static std::vector<std::string> preprocess(const std::string & text) {
736
  const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
737
  std::vector<std::string> words(1, "");
738
 
 
784
  //(cpt >= 0xFF00 && cpt <= 0xFFEF);
785
  }
786
 
787
+ private:
788
  const llama_vocab & vocab;
789
+ // currently unused
790
+ // const llm_tokenizer_wpm * wpm_tokenizer;
791
  };
792
 
793
  //
794
  // UGM tokenizer
795
  //
796
 
797
+ struct llm_tokenizer_ugm : llm_tokenizer {
798
+ llm_tokenizer_ugm(const llama_vocab & vocab) : llm_tokenizer() {
799
  if (vocab.precompiled_charsmap.size() > 0) {
800
  size_t charsmap_offset = 0;
801
 
 
841
  unknown_token_score = min_score - unknown_token_score_penalty;
842
  }
843
 
844
+ // escaped space symbol - U+2581 (Lower One Eighth Block)
845
+ const std::string escaped_space = "\xE2\x96\x81";
846
+
847
+ const char * prefix_replacements = NULL;
848
+ size_t prefix_replacements_size = 0;
849
+
850
+ const uint32_t * xcda_array = NULL;
851
+ size_t xcda_array_size = 0;
852
+
853
+ struct naive_trie user_defined_token_matcher;
854
+
855
+ float min_score = FLT_MAX;
856
+ float max_score = -FLT_MAX;
857
+
858
+ float unknown_token_score_penalty = 10.0;
859
+ float unknown_token_score;
860
+
861
+ struct naive_trie token_matcher;
862
+ };
863
+
864
+ struct llm_tokenizer_ugm_session {
865
+ llm_tokenizer_ugm_session(const llama_vocab & vocab) : vocab(vocab),
866
+ ugm_tokenizer(static_cast<const llm_tokenizer_ugm *>(vocab.tokenizer)) {}
867
+
868
  /* This implementation is based on SentencePiece optimized Viterbi algorithm for
869
  * unigram language models. The general idea is to:
870
  * - move along the input sequence in steps of one UTF code point,
 
903
  // traverse the token matcher trie to find a matching token
904
  bool single_codepoint_token_found = false;
905
  const struct best_tokenization & current_best = tokenization_results[input_offset];
906
+ const struct naive_trie * node = ugm_tokenizer->token_matcher.traverse(normalized[prefix_offset++]);
907
 
908
  while (prefix_offset <= input_len && node != NULL) {
909
  // check if we found valid token in prefix
 
933
  // if we didn't find a valid token corresponding to the whole UTF code point
934
  // then use unknown token as the tokenization of this UTF code point
935
  if (!single_codepoint_token_found) {
936
+ const double challenger_score = current_best.score_sum + ugm_tokenizer->unknown_token_score;
937
  prefix_offset = input_offset + n_utf8_code_units;
938
  struct best_tokenization & current_champ = tokenization_results[prefix_offset];
939
  if (challenger_score > current_champ.score_sum) {
 
965
  }
966
 
967
  private:
 
968
 
969
  // helper structure for returning normalization results
970
  struct normalization_result {
 
977
  normalized->clear();
978
  normalized->reserve(input.size() * 3);
979
 
980
+ const std::string space = vocab.tokenizer_escape_whitespaces ? ugm_tokenizer->escaped_space : " ";
981
 
982
  bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
983
  bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
 
1059
  size_t xcda_array_size;
1060
  };
1061
 
1062
+ // this structure stores the best tokenization so far at input_offset
1063
+ struct best_tokenization {
1064
+ llama_token token_id;
1065
+ size_t input_offset;
1066
+ float score_sum;
1067
+ };
1068
+
1069
  struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
1070
  if (input_offset == input.size()) {
1071
  return { &input[input_offset], 0, 0 };
1072
  }
1073
 
1074
  // if input prefix matches some user-defined token return this token as normalization result
1075
+ auto user_defined_token_match =
1076
+ ugm_tokenizer->user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
1077
  if (user_defined_token_match.second > 0) {
1078
  return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
1079
  }
 
1081
  size_t longest_prefix_length = 0;
1082
  size_t longest_prefix_offset = 0;
1083
 
1084
+ if (ugm_tokenizer->xcda_array_size > 0) {
1085
+ struct xcda_array_view xcda_view(ugm_tokenizer->xcda_array, ugm_tokenizer->xcda_array_size);
1086
 
1087
  // Find the longest normalized sequence matching the input prefix by walking
1088
  // the XOR-compressed compact double array (XCDA) starting from the root node
 
1118
 
1119
  if (longest_prefix_length > 0) {
1120
  // we have a match, so return the replacement sequence
1121
+ if (longest_prefix_offset >= ugm_tokenizer->prefix_replacements_size) {
1122
  throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
1123
  }
1124
+ const char * prefix_replacement = &(ugm_tokenizer->prefix_replacements)[longest_prefix_offset];
1125
  return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
 
1126
  }
 
 
 
 
1127
 
1128
+ // check if the input prefix contains a valid sequence of UTF-8 code units
1129
+ try {
1130
+ // if yes, return this sequence unmodified
1131
+ size_t prefix_offset = input_offset;
1132
+ unicode_cpt_from_utf8(input, prefix_offset);
1133
+ return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
1134
+ } catch (std::invalid_argument & /*ex*/) {
1135
+ // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
1136
+ return { "\xEF\xBF\xBD", 3, 1 };
1137
+ }
1138
+ }
1139
 
1140
+ const llama_vocab & vocab;
1141
+ const llm_tokenizer_ugm * ugm_tokenizer;
1142
  };
1143
 
1144
  //
 
1199
  return output;
1200
  }
1201
 
1202
+ struct llm_tokenizer_rwkv : llm_tokenizer {
1203
+ llm_tokenizer_rwkv(const llama_vocab & vocab) : llm_tokenizer() {
1204
  // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
1205
  // For now, we decode the vocab here into the lookup we'll use for tokenization.
1206
 
 
1212
  }
1213
  }
1214
 
1215
+ struct naive_trie token_matcher;
1216
+ };
1217
+
1218
+ struct llm_tokenizer_rwkv_session {
1219
+ llm_tokenizer_rwkv_session(const llama_vocab & vocab) : vocab(vocab),
1220
+ rwkv_tokenizer(static_cast<const llm_tokenizer_rwkv &>(*vocab.tokenizer)) {}
1221
+
1222
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
1223
  uint32_t position = 0;
 
1224
  while (position < text.size()) {
1225
+ const struct naive_trie * node = rwkv_tokenizer.token_matcher.traverse(text[position]);
1226
  if (node == NULL) {
1227
  // no matching token found, add unknown token
1228
  output.push_back(vocab.special_unk_id);
 
1247
  }
1248
  }
1249
 
1250
+ private:
1251
  const llama_vocab & vocab;
1252
+ const llm_tokenizer_rwkv & rwkv_tokenizer;
 
1253
  };
1254
 
1255
+ void llama_vocab::init_tokenizer() {
1256
+ switch (type) {
1257
+ case LLAMA_VOCAB_TYPE_SPM:
1258
+ tokenizer = new llm_tokenizer_spm(*this);
1259
+ break;
1260
+ case LLAMA_VOCAB_TYPE_BPE:
1261
+ tokenizer = new llm_tokenizer_bpe(*this);
1262
+ break;
1263
+ case LLAMA_VOCAB_TYPE_WPM:
1264
+ tokenizer = new llm_tokenizer_wpm(*this);
1265
+ break;
1266
+ case LLAMA_VOCAB_TYPE_UGM:
1267
+ tokenizer = new llm_tokenizer_ugm(*this);
1268
+ break;
1269
+ case LLAMA_VOCAB_TYPE_RWKV:
1270
+ tokenizer = new llm_tokenizer_rwkv(*this);
1271
+ break;
1272
+ default:
1273
+ GGML_ABORT("unsupported vocab type");
1274
+ }
1275
+ }
1276
+
1277
  //
1278
  // (de-) tokenize
1279
  //
 
1335
 
1336
  // if a fragment is text ( not yet processed )
1337
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
1338
+ const auto & raw_text = fragment.raw_text;
1339
 
1340
  auto raw_text_base_offset = fragment.offset;
1341
  auto raw_text_base_length = fragment.length;
 
1434
  }
1435
  }
1436
 
1437
+ std::vector<llama_vocab::id> llama_tokenize_internal(
1438
+ const llama_vocab & vocab,
1439
+ std::string raw_text,
1440
+ bool add_special,
1441
+ bool parse_special) {
1442
+ GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
1443
+
1444
  std::vector<llama_vocab::id> output;
1445
  std::forward_list<fragment_buffer_variant> fragment_buffer;
1446
 
 
1477
  #ifdef PRETOKENIZERDEBUG
1478
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
1479
  #endif
 
1480
  llama_escape_whitespace(raw_text);
1481
+ llm_tokenizer_spm_session session(vocab);
1482
+ session.tokenize(raw_text, output);
1483
  is_prev_special = false;
1484
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
1485
  output.push_back(fragment.token);
 
1501
  } break;
1502
  case LLAMA_VOCAB_TYPE_BPE:
1503
  {
1504
+ llm_tokenizer_bpe_session session(vocab);
1505
+ // it calls some other methods that do not exist in llm_tokenizer,
1506
+ // here we just cast it to the bpe tokenizer object
1507
  if (add_special) {
1508
+ session.append_bos(output);
1509
  }
1510
  for (const auto & fragment : fragment_buffer) {
1511
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
 
1514
  #ifdef PRETOKENIZERDEBUG
1515
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
1516
  #endif
1517
+ session.tokenize(raw_text, output);
1518
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
1519
+ session.append(fragment.token, output);
1520
  }
1521
  }
1522
 
1523
  if (add_special) {
1524
+ session.append_eos(output);
1525
+ session.check_double_bos_eos(output);
1526
  }
1527
  } break;
1528
  case LLAMA_VOCAB_TYPE_WPM:
 
1532
  output.push_back(vocab.special_cls_id);
1533
  }
1534
 
1535
+ llm_tokenizer_wpm_session session(vocab);
1536
 
1537
  for (const auto & fragment : fragment_buffer) {
1538
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
 
1541
  #ifdef PRETOKENIZERDEBUG
1542
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
1543
  #endif
1544
+ session.tokenize(raw_text, output);
1545
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
1546
  output.push_back(fragment.token);
1547
  }
 
1554
  } break;
1555
  case LLAMA_VOCAB_TYPE_UGM:
1556
  {
1557
+ if (add_special && vocab.tokenizer_add_bos) {
 
 
1558
  GGML_ASSERT(vocab.special_bos_id != -1);
1559
  output.push_back(vocab.special_bos_id);
1560
  }
1561
+ llm_tokenizer_ugm_session session(vocab);
1562
 
1563
  for (const auto & fragment : fragment_buffer) {
1564
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
 
1566
  #ifdef PRETOKENIZERDEBUG
1567
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
1568
  #endif
1569
+ session.tokenize(raw_text, output);
1570
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
1571
  output.push_back(fragment.token);
1572
  }
1573
  }
1574
 
1575
+ if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
1576
  LLAMA_LOG_WARN(
1577
  "%s: Added a BOS token to the prompt as specified by the model but the prompt "
1578
  "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
1579
  "Are you sure this is what you want?\n", __FUNCTION__);
1580
  }
1581
 
1582
+ if (add_special && vocab.tokenizer_add_eos) {
1583
  GGML_ASSERT(vocab.special_eos_id != -1);
1584
  output.push_back(vocab.special_eos_id);
1585
  }
1586
  } break;
1587
  case LLAMA_VOCAB_TYPE_RWKV:
1588
  {
1589
+ llm_tokenizer_rwkv_session session(vocab);
1590
  for (const auto & fragment : fragment_buffer) {
1591
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
1592
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
1595
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
1596
  #endif
1597
 
1598
+ session.tokenize(raw_text, output);
 
1599
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
1600
  output.push_back(fragment.token);
1601
  }
 
1708
  }
1709
 
1710
  int32_t llama_tokenize_impl(
1711
+ const struct llama_vocab & vocab,
1712
+ const char * text,
1713
+ int32_t text_len,
1714
+ llama_token * tokens,
1715
+ int32_t n_tokens_max,
1716
+ bool add_special,
1717
+ bool parse_special) {
1718
  auto res = llama_tokenize_internal(vocab, std::string(text, text_len), add_special, parse_special);
1719
  if (n_tokens_max < (int) res.size()) {
1720
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
 
1791
  // suppressing them like CONTROL tokens.
1792
  if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
1793
  return _try_copy(token_text.data(), token_text.size());
1794
+ }
1795
+ if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
1796
  std::string result = token_text;
1797
  llama_unescape_whitespace(result);
1798
  return _try_copy(result.data(), result.size());
1799
+ }
1800
+ if (attr & LLAMA_TOKEN_ATTR_BYTE) {
1801
  char byte = (char) llama_token_to_byte(vocab, token);
1802
  return _try_copy((char*) &byte, 1);
1803
  }
 
1808
  // suppressing them like CONTROL tokens.
1809
  if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
1810
  return _try_copy(token_text.data(), token_text.size());
1811
+ }
1812
+ if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
1813
  std::string result = llama_decode_text(token_text);
1814
  return _try_copy(result.data(), result.size());
1815
  }
 
1842
  int32_t text_len_max,
1843
  bool remove_special,
1844
  bool unparse_special) {
1845
+ GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
1846
+
1847
  int32_t avail = text_len_max;
1848
  int32_t total = 0;
1849
 
examples/talk-llama/llama-vocab.h CHANGED
@@ -8,6 +8,8 @@
8
  #include <map>
9
  #include <set>
10
 
 
 
11
  struct llama_vocab {
12
  using id = llama_token;
13
  using token = std::string;
@@ -65,7 +67,14 @@ struct llama_vocab {
65
 
66
  std::vector<char> precompiled_charsmap;
67
 
 
 
 
 
 
68
  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
 
 
69
  };
70
 
71
  //
 
8
  #include <map>
9
  #include <set>
10
 
11
+ struct llm_tokenizer;
12
+
13
  struct llama_vocab {
14
  using id = llama_token;
15
  using token = std::string;
 
67
 
68
  std::vector<char> precompiled_charsmap;
69
 
70
+ llm_tokenizer * tokenizer = nullptr;
71
+
72
+ llama_vocab() = default;
73
+ ~llama_vocab();
74
+
75
  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
76
+
77
+ void init_tokenizer();
78
  };
79
 
80
  //
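
For reference, a minimal sketch of how the refactored tokenizer API above fits together: the vocab now owns one shared llm_tokenizer (allocated by init_tokenizer(), released in ~llama_vocab()), and each tokenize call builds a cheap per-call session. The setup below is illustrative only and assumes a vocab already populated by llm_load_vocab():

// #include "llama-vocab.h"   (plus <string>, <vector>)
llama_vocab vocab;
// ... llm_load_vocab() fills in tokens, merges and special ids ...
vocab.init_tokenizer();                     // allocates the llm_tokenizer_* matching vocab.type

std::string text = "example text";          // for SPM, llama_tokenize_internal escapes whitespace first
std::vector<llama_vocab::id> output;
llm_tokenizer_spm_session session(vocab);   // lightweight per-call state (symbols, work queue)
session.tokenize(text, output);             // token ids are appended to output

// the shared tokenizer is deleted by ~llama_vocab() when vocab goes out of scope
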
examples/talk-llama/llama.cpp CHANGED
@@ -215,6 +215,8 @@ enum llm_arch {
215
  LLM_ARCH_EXAONE,
216
  LLM_ARCH_RWKV6,
217
  LLM_ARCH_GRANITE,
 
 
218
  LLM_ARCH_UNKNOWN,
219
  };
220
 
@@ -266,6 +268,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
266
  { LLM_ARCH_EXAONE, "exaone" },
267
  { LLM_ARCH_RWKV6, "rwkv6" },
268
  { LLM_ARCH_GRANITE, "granite" },
 
 
269
  { LLM_ARCH_UNKNOWN, "(unknown)" },
270
  };
271
 
@@ -302,6 +306,7 @@ enum llm_kv {
302
  LLM_KV_DECODER_START_TOKEN_ID,
303
  LLM_KV_ATTN_LOGIT_SOFTCAPPING,
304
  LLM_KV_FINAL_LOGIT_SOFTCAPPING,
 
305
  LLM_KV_RESCALE_EVERY_N_LAYERS,
306
  LLM_KV_TIME_MIX_EXTRA_DIM,
307
  LLM_KV_TIME_DECAY_EXTRA_DIM,
@@ -409,6 +414,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
409
  { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
410
  { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
411
  { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
 
412
  { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
413
  { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
414
  { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
@@ -600,6 +606,8 @@ enum llm_tensor {
600
  LLM_TENSOR_ENC_FFN_DOWN,
601
  LLM_TENSOR_ENC_FFN_UP,
602
  LLM_TENSOR_ENC_OUTPUT_NORM,
 
 
603
  };
604
 
605
  static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -787,6 +795,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
787
  { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
788
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
789
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 
 
790
  },
791
  },
792
  {
@@ -822,6 +832,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
822
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
823
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
824
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 
825
  },
826
  },
827
  {
@@ -1467,6 +1478,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
1467
  {
1468
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1469
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
 
1470
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1471
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1472
  { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
@@ -1478,6 +1490,43 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
1478
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1479
  },
1480
  },
 
1481
  {
1482
  LLM_ARCH_UNKNOWN,
1483
  {
@@ -2341,6 +2390,7 @@ struct llama_hparams {
2341
  bool vocab_only;
2342
  bool rope_finetuned;
2343
  bool use_par_res;
 
2344
 
2345
  uint32_t n_vocab;
2346
  uint32_t n_ctx_train; // context size the model was trained on
@@ -2396,7 +2446,7 @@ struct llama_hparams {
2396
  float f_max_alibi_bias = 0.0f;
2397
  float f_logit_scale = 0.0f;
2398
 
2399
- // Additional scale factors (Granite)
2400
  float f_residual_scale = 0.0f;
2401
  float f_embedding_scale = 0.0f;
2402
  float f_attention_scale = 0.0f;
@@ -2849,6 +2899,7 @@ struct llama_model {
2849
  llama_hparams hparams = {};
2850
  llama_vocab vocab;
2851
 
 
2852
  struct ggml_tensor * tok_embd;
2853
  struct ggml_tensor * type_embd;
2854
  struct ggml_tensor * pos_embd;
@@ -2861,6 +2912,12 @@ struct llama_model {
2861
  struct ggml_tensor * output_b;
2862
  struct ggml_tensor * output_norm_enc;
2863
 
2864
  std::vector<llama_layer> layers;
2865
 
2866
  llama_split_mode split_mode;
@@ -5445,8 +5502,10 @@ static void llm_load_hparams(
5445
  }
5446
  } else {
5447
  switch (hparams.n_layer) {
 
5448
  case 22: model.type = e_model::MODEL_1B; break;
5449
  case 26: model.type = e_model::MODEL_3B; break;
 
5450
  // granite uses a vocab with len 49152
5451
  case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
5452
  case 36: model.type = e_model::MODEL_8B; break; // granite
@@ -5559,11 +5618,11 @@ static void llm_load_hparams(
5559
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
5560
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
5561
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
5562
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
5563
  hparams.f_max_alibi_bias = 8.0f;
5564
 
5565
  switch (hparams.n_layer) {
5566
- case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
5567
  case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
5568
  }
5569
  } break;
@@ -6048,6 +6107,7 @@ static void llm_load_hparams(
6048
  }
6049
  } break;
6050
  case LLM_ARCH_GRANITE:
 
6051
  {
6052
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
6053
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -6056,11 +6116,24 @@ static void llm_load_hparams(
6056
  ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
6057
 
6058
  switch (hparams.n_layer) {
 
6059
  case 40: model.type = e_model::MODEL_3B; break;
6060
  // Add additional layer/vocab/etc checks here for other model sizes
6061
  default: model.type = e_model::MODEL_UNKNOWN;
6062
  }
6063
  } break;
6064
  default: (void)0;
6065
  }
6066
 
@@ -6254,6 +6327,7 @@ static void llm_load_vocab(
6254
  tokenizer_pre == "phi-2" ||
6255
  tokenizer_pre == "jina-es" ||
6256
  tokenizer_pre == "jina-de" ||
 
6257
  tokenizer_pre == "jina-v2-es" ||
6258
  tokenizer_pre == "jina-v2-de" ||
6259
  tokenizer_pre == "jina-v2-code") {
@@ -6318,6 +6392,11 @@ static void llm_load_vocab(
6318
  } else if (
6319
  tokenizer_pre == "exaone") {
6320
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
 
 
 
 
 
6321
  } else {
6322
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
6323
  }
@@ -6375,7 +6454,12 @@ static void llm_load_vocab(
6375
 
6376
  for (uint32_t i = 0; i < n_vocab; i++) {
6377
  std::string word = gguf_get_arr_str(ctx, token_idx, i);
6378
- GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
 
 
 
 
 
6379
 
6380
  vocab.token_to_id[word] = i;
6381
  vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
@@ -6400,6 +6484,8 @@ static void llm_load_vocab(
6400
  }
6401
  GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
6402
 
 
 
6403
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
6404
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
6405
  // For Fill-In-the-Middle (FIM)/infill models which were converted
@@ -6454,8 +6540,14 @@ static void llm_load_vocab(
6454
  vocab.linefeed_id = ids[0];
6455
  } else {
6456
  const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
6457
- GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
6458
- vocab.linefeed_id = ids[0];
6459
  }
6460
 
6461
  // special tokens
@@ -6810,7 +6902,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
6810
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
6811
  }
6812
 
6813
- if (model.arch == LLM_ARCH_GRANITE) {
6814
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
6815
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
6816
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -6984,6 +7076,7 @@ static bool llm_load_tensors(
6984
  case LLM_ARCH_REFACT:
6985
  case LLM_ARCH_MINICPM:
6986
  case LLM_ARCH_GRANITE:
 
6987
  {
6988
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6989
 
@@ -7327,6 +7420,12 @@ static bool llm_load_tensors(
7327
 
7328
  if (model.arch == LLM_ARCH_BERT) {
7329
  model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train});
7330
  }
7331
 
7332
  model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
@@ -7379,6 +7478,8 @@ static bool llm_load_tensors(
7379
  model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
7380
  model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
7381
 
 
 
7382
  for (int i = 0; i < n_layer; ++i) {
7383
  ggml_context * ctx_layer = ctx_for_layer(i);
7384
  ggml_context * ctx_split = ctx_for_layer_split(i);
@@ -8704,6 +8805,45 @@ static bool llm_load_tensors(
8704
  }
8705
 
8706
  } break;
 
8707
  default:
8708
  throw std::runtime_error("unknown architecture");
8709
  }
@@ -10173,6 +10313,10 @@ struct llm_build_context {
10173
  struct ggml_tensor * cur;
10174
 
10175
  switch (pooling_type) {
 
 
 
 
10176
  case LLAMA_POOLING_TYPE_MEAN:
10177
  {
10178
  struct ggml_tensor * inp_mean = build_inp_mean();
@@ -10184,9 +10328,26 @@ struct llm_build_context {
10184
  struct ggml_tensor * inp_cls = build_inp_cls();
10185
  cur = ggml_get_rows(ctx0, inp, inp_cls);
10186
  } break;
10187
- case LLAMA_POOLING_TYPE_NONE:
10188
  {
10189
- cur = inp;
10190
  } break;
10191
  default:
10192
  {
@@ -11415,8 +11576,8 @@ struct llm_build_context {
11415
  inpL = cur;
11416
  }
11417
 
11418
- // final output
11419
  cur = inpL;
 
11420
  cb(cur, "result_embd", -1);
11421
 
11422
  ggml_build_forward_expand(gf, cur);
@@ -15848,6 +16009,184 @@ struct llm_build_context {
15848
 
15849
  return gf;
15850
  }
15851
  };
15852
 
15853
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -15930,6 +16269,7 @@ static struct ggml_cgraph * llama_build_graph(
15930
  switch (model.arch) {
15931
  case LLM_ARCH_LLAMA:
15932
  case LLM_ARCH_GRANITE:
 
15933
  {
15934
  result = llm.build_llama();
15935
  } break;
@@ -16107,6 +16447,10 @@ static struct ggml_cgraph * llama_build_graph(
16107
  {
16108
  result = llm.build_rwkv6();
16109
  } break;
 
 
 
 
16110
  default:
16111
  GGML_ABORT("fatal error");
16112
  }
@@ -16393,7 +16737,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
16393
  }
16394
  }
16395
 
16396
- if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
 
 
16397
  const int64_t n_tokens = batch.n_tokens;
16398
  const int64_t n_seq_tokens = batch.n_seq_tokens;
16399
  const int64_t n_seqs = batch.n_seqs;
@@ -16408,7 +16754,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
16408
  const llama_seq_id seq_id = batch.seq_id[s][0];
16409
 
16410
  // TODO: adapt limits to n_seqs when batch.equal_seqs is true
16411
- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
16412
 
16413
  for (int i = 0; i < n_seq_tokens; ++i) {
16414
  const llama_pos pos = batch.pos[s*n_seq_tokens + i];
@@ -16679,12 +17025,6 @@ static void llama_graph_compute(
16679
  ggml_cgraph * gf,
16680
  int n_threads,
16681
  ggml_threadpool * threadpool) {
16682
- #ifdef GGML_USE_METAL
16683
- if (ggml_backend_is_metal(lctx.backend_metal)) {
16684
- ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
16685
- }
16686
- #endif
16687
-
16688
  if (lctx.backend_cpu != nullptr) {
16689
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
16690
  ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
@@ -16948,6 +17288,20 @@ static int llama_decode_internal(
16948
  ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
16949
  }
16950
  } break;
16951
  case LLAMA_POOLING_TYPE_UNSPECIFIED:
16952
  {
16953
  GGML_ABORT("unknown pooling type");
@@ -17154,6 +17508,13 @@ static int llama_encode_internal(
17154
  ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
17155
  }
17156
  } break;
17157
  case LLAMA_POOLING_TYPE_UNSPECIFIED:
17158
  {
17159
  GGML_ABORT("unknown pooling type");
@@ -19231,6 +19592,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
19231
  case LLM_ARCH_DEEPSEEK2:
19232
  case LLM_ARCH_CHATGLM:
19233
  case LLM_ARCH_GRANITE:
 
 
19234
  return LLAMA_ROPE_TYPE_NORM;
19235
 
19236
  // the pairs of head values are offset by n_rot/2
 
215
  LLM_ARCH_EXAONE,
216
  LLM_ARCH_RWKV6,
217
  LLM_ARCH_GRANITE,
218
+ LLM_ARCH_GRANITE_MOE,
219
+ LLM_ARCH_CHAMELEON,
220
  LLM_ARCH_UNKNOWN,
221
  };
222
 
 
268
  { LLM_ARCH_EXAONE, "exaone" },
269
  { LLM_ARCH_RWKV6, "rwkv6" },
270
  { LLM_ARCH_GRANITE, "granite" },
271
+ { LLM_ARCH_GRANITE_MOE, "granitemoe" },
272
+ { LLM_ARCH_CHAMELEON, "chameleon" },
273
  { LLM_ARCH_UNKNOWN, "(unknown)" },
274
  };
275
 
 
306
  LLM_KV_DECODER_START_TOKEN_ID,
307
  LLM_KV_ATTN_LOGIT_SOFTCAPPING,
308
  LLM_KV_FINAL_LOGIT_SOFTCAPPING,
309
+ LLM_KV_SWIN_NORM,
310
  LLM_KV_RESCALE_EVERY_N_LAYERS,
311
  LLM_KV_TIME_MIX_EXTRA_DIM,
312
  LLM_KV_TIME_DECAY_EXTRA_DIM,
 
414
  { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
415
  { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
416
  { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
417
+ { LLM_KV_SWIN_NORM, "%s.swin_norm" },
418
  { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
419
  { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
420
  { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
 
606
  LLM_TENSOR_ENC_FFN_DOWN,
607
  LLM_TENSOR_ENC_FFN_UP,
608
  LLM_TENSOR_ENC_OUTPUT_NORM,
609
+ LLM_TENSOR_CLS,
610
+ LLM_TENSOR_CLS_OUT,
611
  };
612
 
613
  static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
 
795
  { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
796
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
797
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
798
+ { LLM_TENSOR_CLS, "cls" },
799
+ { LLM_TENSOR_CLS_OUT, "cls.output" },
800
  },
801
  },
802
  {
 
832
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
833
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
834
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
835
+ { LLM_TENSOR_CLS, "cls" },
836
  },
837
  },
838
  {
 
1478
  {
1479
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1480
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1481
+ { LLM_TENSOR_OUTPUT, "output" },
1482
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1483
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1484
  { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
 
1490
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1491
  },
1492
  },
1493
+ {
1494
+ LLM_ARCH_GRANITE_MOE,
1495
+ {
1496
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1497
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1498
+ { LLM_TENSOR_OUTPUT, "output" },
1499
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1500
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1501
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1502
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1503
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1504
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1505
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
1506
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
1507
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
1508
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1509
+ },
1510
+ },
1511
+ {
1512
+ LLM_ARCH_CHAMELEON,
1513
+ {
1514
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1515
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1516
+ { LLM_TENSOR_OUTPUT, "output" },
1517
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1518
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1519
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1520
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1521
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1522
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1523
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1524
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1525
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1526
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
1527
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
1528
+ },
1529
+ },
1530
  {
1531
  LLM_ARCH_UNKNOWN,
1532
  {
 
2390
  bool vocab_only;
2391
  bool rope_finetuned;
2392
  bool use_par_res;
2393
+ bool swin_norm;
2394
 
2395
  uint32_t n_vocab;
2396
  uint32_t n_ctx_train; // context size the model was trained on
 
2446
  float f_max_alibi_bias = 0.0f;
2447
  float f_logit_scale = 0.0f;
2448
 
2449
+ // Additional scale factors (Granite/Granite MoE)
2450
  float f_residual_scale = 0.0f;
2451
  float f_embedding_scale = 0.0f;
2452
  float f_attention_scale = 0.0f;
 
2899
  llama_hparams hparams = {};
2900
  llama_vocab vocab;
2901
 
2902
+ // TODO: should init all tensors to nullptr
2903
  struct ggml_tensor * tok_embd;
2904
  struct ggml_tensor * type_embd;
2905
  struct ggml_tensor * pos_embd;
 
2912
  struct ggml_tensor * output_b;
2913
  struct ggml_tensor * output_norm_enc;
2914
 
2915
+ // classifier
2916
+ struct ggml_tensor * cls;
2917
+ struct ggml_tensor * cls_b;
2918
+ struct ggml_tensor * cls_out = nullptr;
2919
+ struct ggml_tensor * cls_out_b = nullptr;
2920
+
2921
  std::vector<llama_layer> layers;
2922
 
2923
  llama_split_mode split_mode;
 
5502
  }
5503
  } else {
5504
  switch (hparams.n_layer) {
5505
+ case 16: model.type = e_model::MODEL_1B; break; // Llama 3.2 1B
5506
  case 22: model.type = e_model::MODEL_1B; break;
5507
  case 26: model.type = e_model::MODEL_3B; break;
5508
+ case 28: model.type = e_model::MODEL_3B; break; // Llama 3.2 3B
5509
  // granite uses a vocab with len 49152
5510
  case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
5511
  case 36: model.type = e_model::MODEL_8B; break; // granite
 
5618
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
5619
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
5620
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
5621
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
5622
  hparams.f_max_alibi_bias = 8.0f;
5623
 
5624
  switch (hparams.n_layer) {
5625
+ case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
5626
  case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
5627
  }
5628
  } break;
 
6107
  }
6108
  } break;
6109
  case LLM_ARCH_GRANITE:
6110
+ case LLM_ARCH_GRANITE_MOE:
6111
  {
6112
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
6113
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
 
6116
  ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
6117
 
6118
  switch (hparams.n_layer) {
6119
+ case 32: model.type = e_model::MODEL_3B; break;
6120
  case 40: model.type = e_model::MODEL_3B; break;
6121
  // Add additional layer/vocab/etc checks here for other model sizes
6122
  default: model.type = e_model::MODEL_UNKNOWN;
6123
  }
6124
  } break;
6125
+ case LLM_ARCH_CHAMELEON:
6126
+ {
6127
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
6128
+ hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
6129
+ ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
6130
+
6131
+ switch (hparams.n_layer) {
6132
+ case 32: model.type = e_model::MODEL_7B; break;
6133
+ case 48: model.type = e_model::MODEL_34B; break;
6134
+ default: model.type = e_model::MODEL_UNKNOWN;
6135
+ }
6136
+ } break;
6137
  default: (void)0;
6138
  }
6139
 
 
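For context on the "eps for qk-norm" comment above: Chameleon applies a per-head LayerNorm to the query and key vectors before RoPE (see the attn_q_norm / attn_k_norm handling in build_chameleon() further down in this diff). A scalar sketch of LayerNorm over one head's vector, using the same 1e-5 epsilon; sizes and gamma/beta values are illustrative.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // LayerNorm over a single attention head's query (or key) vector.
    static void layer_norm(std::vector<float> & v, const std::vector<float> & gamma,
                           const std::vector<float> & beta, float eps) {
        float mean = 0.0f;
        for (float x : v) mean += x;
        mean /= v.size();
        float var = 0.0f;
        for (float x : v) var += (x - mean) * (x - mean);
        var /= v.size();
        const float inv = 1.0f / std::sqrt(var + eps);
        for (size_t i = 0; i < v.size(); ++i) {
            v[i] = (v[i] - mean) * inv * gamma[i] + beta[i];
        }
    }

    int main() {
        std::vector<float> q     = {0.2f, -1.0f, 0.7f, 0.1f}; // one head, n_embd_head = 4
        std::vector<float> gamma = {1.0f, 1.0f, 1.0f, 1.0f};
        std::vector<float> beta  = {0.0f, 0.0f, 0.0f, 0.0f};
        layer_norm(q, gamma, beta, 1e-5f);
        std::printf("q[0]=%f q[1]=%f\n", q[0], q[1]);
        return 0;
    }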
6327
  tokenizer_pre == "phi-2" ||
6328
  tokenizer_pre == "jina-es" ||
6329
  tokenizer_pre == "jina-de" ||
6330
+ tokenizer_pre == "jina-v1-en" ||
6331
  tokenizer_pre == "jina-v2-es" ||
6332
  tokenizer_pre == "jina-v2-de" ||
6333
  tokenizer_pre == "jina-v2-code") {
 
6392
  } else if (
6393
  tokenizer_pre == "exaone") {
6394
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
6395
+ } else if (
6396
+ tokenizer_pre == "chameleon") {
6397
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
6398
+ vocab.tokenizer_add_bos = true;
6399
+ vocab.tokenizer_clean_spaces = false;
6400
  } else {
6401
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
6402
  }
 
6454
 
6455
  for (uint32_t i = 0; i < n_vocab; i++) {
6456
  std::string word = gguf_get_arr_str(ctx, token_idx, i);
6457
+
6458
+ //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
6459
+ if (word.empty()) {
6460
+ LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
6461
+ word = "[EMPTY_" + std::to_string(i) + "]";
6462
+ }
6463
 
6464
  vocab.token_to_id[word] = i;
6465
  vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
 
6484
  }
6485
  GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
6486
 
6487
+ vocab.init_tokenizer();
6488
+
6489
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
6490
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
6491
  // For Fill-In-the-Middle (FIM)/infill models which were converted
 
6540
  vocab.linefeed_id = ids[0];
6541
  } else {
6542
  const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
6543
+
6544
+ //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
6545
+ if (ids.empty()) {
6546
+ LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
6547
+ vocab.linefeed_id = vocab.special_pad_id;
6548
+ } else {
6549
+ vocab.linefeed_id = ids[0];
6550
+ }
6551
  }
6552
 
6553
  // special tokens
 
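For context on the "\xC4\x8A" literal in the hunk above: byte-level BPE vocabularies (GPT-2 style) remap non-printable bytes to printable code points, and the newline byte 0x0A ends up as U+010A ('Ċ'), whose UTF-8 encoding is 0xC4 0x8A. A small sketch of that encoding; the +0x100 offset shown here holds for the low control bytes such as 0x0A, not for the full byte-to-codepoint table.

    #include <cstdio>

    int main() {
        // Newline (0x0A) is one of the low control bytes that GPT-2-style
        // byte-level BPE shifts by 0x100, giving code point U+010A ('Ċ').
        const unsigned int cp = 0x0A + 0x100;                         // 0x10A
        // UTF-8 for a code point below 0x800 is two bytes: 110xxxxx 10xxxxxx.
        const unsigned char b0 = (unsigned char)(0xC0 | (cp >> 6));   // 0xC4
        const unsigned char b1 = (unsigned char)(0x80 | (cp & 0x3F)); // 0x8A
        std::printf("U+%04X -> %02X %02X\n", cp, b0, b1);
        return 0;
    }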
6902
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
6903
  }
6904
 
6905
+ if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
6906
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
6907
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
6908
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
 
7076
  case LLM_ARCH_REFACT:
7077
  case LLM_ARCH_MINICPM:
7078
  case LLM_ARCH_GRANITE:
7079
+ case LLM_ARCH_GRANITE_MOE:
7080
  {
7081
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
7082
 
 
7420
 
7421
  if (model.arch == LLM_ARCH_BERT) {
7422
  model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train});
7423
+
7424
+ model.cls = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
7425
+ model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
7426
+
7427
+ model.cls_out = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7428
+ model.cls_out_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7429
  }
7430
 
7431
  model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
 
7478
  model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
7479
  model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); // LayerNorm bias
7480
 
7481
+ model.cls = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7482
+ model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7483
  for (int i = 0; i < n_layer; ++i) {
7484
  ggml_context * ctx_layer = ctx_for_layer(i);
7485
  ggml_context * ctx_split = ctx_for_layer_split(i);
 
8805
  }
8806
 
8807
  } break;
8808
+ case LLM_ARCH_CHAMELEON:
8809
+ {
8810
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
8811
+
8812
+ // output
8813
+ {
8814
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
8815
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
8816
+
8817
+ // if output is NULL, init from the input tok embed
8818
+ if (model.output == NULL) {
8819
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
8820
+ }
8821
+ }
8822
+
8823
+ for (int i = 0; i < n_layer; ++i) {
8824
+ ggml_context * ctx_layer = ctx_for_layer(i);
8825
+ ggml_context * ctx_split = ctx_for_layer_split(i);
8826
+
8827
+ auto & layer = model.layers[i];
8828
+
8829
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
8830
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head});
8831
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv});
8832
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
8833
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
8834
+
8835
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
8836
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
8837
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
8838
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
8839
+
8840
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
8841
+
8842
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
8843
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
8844
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
8845
+ }
8846
+ } break;
8847
  default:
8848
  throw std::runtime_error("unknown architecture");
8849
  }
 
10313
  struct ggml_tensor * cur;
10314
 
10315
  switch (pooling_type) {
10316
+ case LLAMA_POOLING_TYPE_NONE:
10317
+ {
10318
+ cur = inp;
10319
+ } break;
10320
  case LLAMA_POOLING_TYPE_MEAN:
10321
  {
10322
  struct ggml_tensor * inp_mean = build_inp_mean();
 
10328
  struct ggml_tensor * inp_cls = build_inp_cls();
10329
  cur = ggml_get_rows(ctx0, inp, inp_cls);
10330
  } break;
10331
+ case LLAMA_POOLING_TYPE_RANK:
10332
  {
10333
+ struct ggml_tensor * inp_cls = build_inp_cls();
10334
+ inp = ggml_get_rows(ctx0, inp, inp_cls);
10335
+
10336
+ // classification head
10337
+ // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
10338
+ GGML_ASSERT(model.cls != nullptr);
10339
+ GGML_ASSERT(model.cls_b != nullptr);
10340
+
10341
+ cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b);
10342
+ cur = ggml_tanh(ctx0, cur);
10343
+
10344
+ // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
10345
+ // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
10346
+ if (model.cls_out) {
10347
+ GGML_ASSERT(model.cls_out_b != nullptr);
10348
+
10349
+ cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b);
10350
+ }
10351
  } break;
10352
  default:
10353
  {
 
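For context, the RANK branch above takes the hidden state at each sequence's CLS position and runs it through the classification head loaded earlier: a dense layer plus tanh (cls / cls bias), optionally followed by a second projection to a single score (cls_out / cls_out bias); per the comment above, models such as jina-reranker-v1-tiny-en fold the score projection into cls directly. A plain-float sketch of that computation; shapes and values are illustrative, the real graph uses ggml tensors.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    //   score = w_out . tanh(W_cls * h_cls + b_cls) + b_out
    static float rank_score(const std::vector<std::vector<float>> & W_cls,
                            const std::vector<float> & b_cls,
                            const std::vector<float> & w_out, float b_out,
                            const std::vector<float> & h_cls) {
        std::vector<float> pooled(W_cls.size(), 0.0f);
        for (size_t i = 0; i < W_cls.size(); ++i) {
            float acc = b_cls[i];
            for (size_t j = 0; j < h_cls.size(); ++j) {
                acc += W_cls[i][j] * h_cls[j];
            }
            pooled[i] = std::tanh(acc);
        }
        float score = b_out;
        for (size_t i = 0; i < pooled.size(); ++i) {
            score += w_out[i] * pooled[i];
        }
        return score;
    }

    int main() {
        const std::vector<std::vector<float>> W_cls = {{0.1f, -0.2f}, {0.3f, 0.4f}};
        const std::vector<float> b_cls = {0.0f, 0.1f};
        const std::vector<float> w_out = {0.5f, -0.5f};
        const std::vector<float> h_cls = {1.0f, 2.0f}; // hidden state at the CLS position
        std::printf("rank score: %f\n", rank_score(W_cls, b_cls, w_out, 0.0f, h_cls));
        return 0;
    }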
11576
  inpL = cur;
11577
  }
11578
 
 
11579
  cur = inpL;
11580
+
11581
  cb(cur, "result_embd", -1);
11582
 
11583
  ggml_build_forward_expand(gf, cur);
 
16009
 
16010
  return gf;
16011
  }
16012
+
16013
+ // ref: https://github.com/facebookresearch/chameleon
16014
+ // based on the original build_llama() function, changes:
16015
+ // * qk-norm
16016
+ // * swin-norm
16017
+ // * removed bias
16018
+ // * removed MoE
16019
+ struct ggml_cgraph * build_chameleon() {
16020
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
16021
+
16022
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
16023
+ int32_t n_tokens = this->n_tokens;
16024
+
16025
+ const int64_t n_embd_head = hparams.n_embd_head_v;
16026
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
16027
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
16028
+
16029
+ struct ggml_tensor * cur;
16030
+ struct ggml_tensor * inpL;
16031
+
16032
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
16033
+
16034
+ // inp_pos - contains the positions
16035
+ struct ggml_tensor * inp_pos = build_inp_pos();
16036
+
16037
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
16038
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
16039
+
16040
+ for (int il = 0; il < n_layer; ++il) {
16041
+ struct ggml_tensor * inpSA = inpL;
16042
+
16043
+ // norm
16044
+ if (hparams.swin_norm) {
16045
+ cur = inpL;
16046
+ } else {
16047
+ cur = llm_build_norm(ctx0, inpL, hparams,
16048
+ model.layers[il].attn_norm, NULL,
16049
+ LLM_NORM_RMS, cb, il);
16050
+ cb(cur, "attn_norm", il);
16051
+ }
16052
+
16053
+ // self-attention
16054
+ {
16055
+ // compute Q and K and RoPE them
16056
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
16057
+ cb(Qcur, "Qcur", il);
16058
+
16059
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
16060
+ cb(Kcur, "Kcur", il);
16061
+
16062
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
16063
+ cb(Vcur, "Vcur", il);
16064
+
16065
+ if (model.layers[il].attn_q_norm) {
16066
+ Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
16067
+ ggml_element_size(Qcur) * n_embd_head,
16068
+ ggml_element_size(Qcur) * n_embd_head * n_head,
16069
+ 0);
16070
+ cb(Qcur, "Qcur", il);
16071
+
16072
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
16073
+ model.layers[il].attn_q_norm,
16074
+ model.layers[il].attn_q_norm_b,
16075
+ LLM_NORM, cb, il);
16076
+ cb(Qcur, "Qcur", il);
16077
+ }
16078
+
16079
+ if (model.layers[il].attn_k_norm) {
16080
+ Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
16081
+ ggml_element_size(Kcur) * n_embd_head,
16082
+ ggml_element_size(Kcur) * n_embd_head * n_head_kv,
16083
+ 0);
16084
+ cb(Kcur, "Kcur", il);
16085
+
16086
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
16087
+ model.layers[il].attn_k_norm,
16088
+ model.layers[il].attn_k_norm_b,
16089
+ LLM_NORM, cb, il);
16090
+ cb(Kcur, "Kcur", il);
16091
+ }
16092
+
16093
+ Qcur = ggml_rope_ext(
16094
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
16095
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
16096
+ ext_factor, attn_factor, beta_fast, beta_slow
16097
+ );
16098
+ cb(Qcur, "Qcur", il);
16099
+
16100
+ Kcur = ggml_rope_ext(
16101
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
16102
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
16103
+ ext_factor, attn_factor, beta_fast, beta_slow
16104
+ );
16105
+ cb(Kcur, "Kcur", il);
16106
+
16107
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
16108
+ model.layers[il].wo, nullptr,
16109
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
16110
+
16111
+ if (hparams.swin_norm) {
16112
+ cur = llm_build_norm(ctx0, cur, hparams,
16113
+ model.layers[il].attn_norm, NULL,
16114
+ LLM_NORM_RMS, cb, il);
16115
+ }
16116
+ }
16117
+
16118
+ if (il == n_layer - 1) {
16119
+ // skip computing output for unused tokens
16120
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
16121
+ n_tokens = n_outputs;
16122
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
16123
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
16124
+ }
16125
+
16126
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
16127
+ cb(ffn_inp, "ffn_inp", il);
16128
+
16129
+ // feed-forward network
16130
+ if (!hparams.swin_norm) {
16131
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
16132
+ model.layers[il].ffn_norm, NULL,
16133
+ LLM_NORM_RMS, cb, il);
16134
+ cb(cur, "ffn_norm", il);
16135
+ }
16136
+
16137
+ cur = llm_build_ffn(ctx0, lctx, cur,
16138
+ model.layers[il].ffn_up, NULL, NULL,
16139
+ model.layers[il].ffn_gate, NULL, NULL,
16140
+ model.layers[il].ffn_down, NULL, NULL,
16141
+ NULL,
16142
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
16143
+ cb(cur, "ffn_out", il);
16144
+
16145
+ if (hparams.swin_norm) {
16146
+ cur = llm_build_norm(ctx0, cur, hparams,
16147
+ model.layers[il].ffn_norm, NULL,
16148
+ LLM_NORM_RMS, cb, il);
16149
+ cb(cur, "ffn_norm", il);
16150
+ }
16151
+
16152
+ cur = ggml_add(ctx0, cur, ffn_inp);
16153
+ cb(cur, "ffn_out", il);
16154
+
16155
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
16156
+ cb(cur, "l_out", il);
16157
+
16158
+ // input for next layer
16159
+ inpL = cur;
16160
+ }
16161
+
16162
+ cur = inpL;
16163
+
16164
+ cur = llm_build_norm(ctx0, cur, hparams,
16165
+ model.output_norm, NULL,
16166
+ LLM_NORM_RMS, cb, -1);
16167
+ cb(cur, "result_norm", -1);
16168
+
16169
+ // lm_head
16170
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
16171
+ cb(cur, "result_output_with_img_logits", -1);
16172
+
16173
+ // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
16174
+ // Needs to be removed once image outputs are supported.
16175
+ int img_token_end_idx = 8196;
16176
+ int img_token_start_idx = 4;
16177
+ int num_img_tokens = img_token_end_idx - img_token_start_idx;
16178
+ // creates a 1D tensor of size num_img_tokens filled with -FLT_MAX,
16179
+ // which ensures that text token values are always larger than image token values
16180
+ struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
16181
+ img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
16182
+ cb(img_logits, "img_logits", -1);
16183
+ cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
16184
+ cb(cur, "result_output", -1);
16185
+
16186
+ ggml_build_forward_expand(gf, cur);
16187
+
16188
+ return gf;
16189
+ }
16190
  };
16191
 
16192
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
 
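For context, the img_logits trick at the end of build_chameleon() is a masked write into the output logits: every vocabulary id in [img_token_start_idx, img_token_end_idx) is forced to -FLT_MAX, so sampling can only ever pick text tokens. An equivalent operation on a plain logits array; n_vocab here is a made-up small number purely for illustration.

    #include <cfloat>
    #include <cstdio>
    #include <vector>

    int main() {
        const int img_token_start_idx = 4;
        const int img_token_end_idx   = 8196;
        const int n_vocab             = 16384; // illustrative only

        std::vector<float> logits(n_vocab, 0.0f);
        for (int i = img_token_start_idx; i < img_token_end_idx && i < n_vocab; ++i) {
            logits[i] = -FLT_MAX; // equivalent of the ggml_clamp + ggml_set_1d above
        }
        std::printf("logit[3]=%g logit[4]=%g logit[8195]=%g logit[8196]=%g\n",
                    logits[3], logits[4], logits[8195], logits[8196]);
        return 0;
    }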
16269
  switch (model.arch) {
16270
  case LLM_ARCH_LLAMA:
16271
  case LLM_ARCH_GRANITE:
16272
+ case LLM_ARCH_GRANITE_MOE:
16273
  {
16274
  result = llm.build_llama();
16275
  } break;
 
16447
  {
16448
  result = llm.build_rwkv6();
16449
  } break;
16450
+ case LLM_ARCH_CHAMELEON:
16451
+ {
16452
+ result = llm.build_chameleon();
16453
+ } break;
16454
  default:
16455
  GGML_ABORT("fatal error");
16456
  }
 
16737
  }
16738
  }
16739
 
16740
+ if (cparams.embeddings && (
16741
+ cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
16742
+ cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
16743
  const int64_t n_tokens = batch.n_tokens;
16744
  const int64_t n_seq_tokens = batch.n_seq_tokens;
16745
  const int64_t n_seqs = batch.n_seqs;
 
16754
  const llama_seq_id seq_id = batch.seq_id[s][0];
16755
 
16756
  // TODO: adapt limits to n_seqs when batch.equal_seqs is true
16757
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
16758
 
16759
  for (int i = 0; i < n_seq_tokens; ++i) {
16760
  const llama_pos pos = batch.pos[s*n_seq_tokens + i];
 
17025
  ggml_cgraph * gf,
17026
  int n_threads,
17027
  ggml_threadpool * threadpool) {
 
 
 
 
 
 
17028
  if (lctx.backend_cpu != nullptr) {
17029
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
17030
  ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
 
17288
  ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
17289
  }
17290
  } break;
17291
+ case LLAMA_POOLING_TYPE_RANK:
17292
+ {
17293
+ // extract the rerank score - a single float per sequence
17294
+ auto & embd_seq_out = lctx.embd_seq;
17295
+
17296
+ for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
17297
+ const llama_seq_id seq_id = ubatch.seq_id[s][0];
17298
+ if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
17299
+ continue;
17300
+ }
17301
+ embd_seq_out[seq_id].resize(1);
17302
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
17303
+ }
17304
+ } break;
17305
  case LLAMA_POOLING_TYPE_UNSPECIFIED:
17306
  {
17307
  GGML_ABORT("unknown pooling type");
 
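For context, the score written into lctx.embd_seq above is what the updated llama.h comment (further down in this diff) describes for RANK pooling: llama_get_embeddings_seq() then returns a single float per sequence. A hedged usage sketch; the helper name is made up, and it assumes a reranker model with embeddings enabled and pooling_type == LLAMA_POOLING_TYPE_RANK.

    #include "llama.h"
    #include <cstdio>

    // Hypothetical helper: read back the rerank score for one sequence after
    // llama_decode() has run; returns true if a score was available.
    static bool get_rank_score(llama_context * ctx, llama_seq_id seq_id, float * out) {
        const float * score = llama_get_embeddings_seq(ctx, seq_id); // float[1] for RANK pooling
        if (score == nullptr) {
            return false;
        }
        *out = score[0];
        return true;
    }

As with the other embedding getters, the returned pointer refers to context-owned memory, so the value should be copied out if it needs to outlive the next decode call.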
17508
  ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
17509
  }
17510
  } break;
17511
+ case LLAMA_POOLING_TYPE_RANK:
17512
+ {
17513
+ // TODO: this likely should be the same logic as in llama_decode_internal, but better to
17514
+ // wait for an encoder model that requires this pooling type in order to test it
17515
+ // https://github.com/ggerganov/llama.cpp/pull/9510
17516
+ GGML_ABORT("RANK pooling not implemented yet");
17517
+ }
17518
  case LLAMA_POOLING_TYPE_UNSPECIFIED:
17519
  {
17520
  GGML_ABORT("unknown pooling type");
 
19592
  case LLM_ARCH_DEEPSEEK2:
19593
  case LLM_ARCH_CHATGLM:
19594
  case LLM_ARCH_GRANITE:
19595
+ case LLM_ARCH_GRANITE_MOE:
19596
+ case LLM_ARCH_CHAMELEON:
19597
  return LLAMA_ROPE_TYPE_NORM;
19598
 
19599
  // the pairs of head values are offset by n_rot/2
examples/talk-llama/llama.h CHANGED
@@ -102,6 +102,7 @@ extern "C" {
102
  LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
103
  LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
104
  LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
 
105
  };
106
 
107
  enum llama_rope_type {
@@ -192,6 +193,7 @@ extern "C" {
192
  LLAMA_POOLING_TYPE_MEAN = 1,
193
  LLAMA_POOLING_TYPE_CLS = 2,
194
  LLAMA_POOLING_TYPE_LAST = 3,
 
195
  };
196
 
197
  enum llama_attention_type {
@@ -201,9 +203,9 @@ extern "C" {
201
  };
202
 
203
  enum llama_split_mode {
204
- LLAMA_SPLIT_MODE_NONE = 0, // single GPU
205
- LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
206
- LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
207
  };
208
 
209
  // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@@ -871,7 +873,8 @@ extern "C" {
871
 
872
  // Get the embeddings for a sequence id
873
  // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
874
- // shape: [n_embd] (1-dimensional)
 
875
  LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
876
 
877
  //
@@ -910,6 +913,8 @@ extern "C" {
910
  //
911
  // Tokenization
912
  //
 
 
913
 
914
  /// @details Convert the provided text into tokens.
915
  /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
 
102
  LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
103
  LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
104
  LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
105
+ LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
106
  };
107
 
108
  enum llama_rope_type {
 
193
  LLAMA_POOLING_TYPE_MEAN = 1,
194
  LLAMA_POOLING_TYPE_CLS = 2,
195
  LLAMA_POOLING_TYPE_LAST = 3,
196
+ LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
197
  };
198
 
199
  enum llama_attention_type {
 
203
  };
204
 
205
  enum llama_split_mode {
206
+ LLAMA_SPLIT_MODE_NONE = 0, // single GPU
207
+ LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
208
+ LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
209
  };
210
 
211
  // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
 
873
 
874
  // Get the embeddings for a sequence id
875
  // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
876
+ // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
877
+ // otherwise: float[n_embd] (1-dimensional)
878
  LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
879
 
880
  //
 
913
  //
914
  // Tokenization
915
  //
916
+ // The API is thread-safe.
917
+ //
918
 
919
  /// @details Convert the provided text into tokens.
920
  /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
src/whisper.cpp CHANGED
@@ -204,11 +204,6 @@ static bool ggml_graph_compute_helper(
204
  if (ggml_backend_is_blas(backend)) {
205
  ggml_backend_blas_set_n_threads(backend, n_threads);
206
  }
207
- #endif
208
- #ifdef GGML_USE_METAL
209
- if (ggml_backend_is_metal(backend)) {
210
- ggml_backend_metal_set_n_cb(backend, n_threads);
211
- }
212
  #endif
213
  }
214
 
 
204
  if (ggml_backend_is_blas(backend)) {
205
  ggml_backend_blas_set_n_threads(backend, n_threads);
206
  }
 
 
 
 
 
207
  #endif
208
  }
209