talk-llama : sync llama.cpp
Changed files:
- examples/talk-llama/llama-vocab.cpp  +189 -106
- examples/talk-llama/llama-vocab.h    +9 -0
- examples/talk-llama/llama.cpp        +381 -18
- examples/talk-llama/llama.h          +9 -4
- src/whisper.cpp                      +0 -5
examples/talk-llama/llama-vocab.cpp
CHANGED
@@ -50,7 +50,7 @@ struct naive_trie {
             res.first->second.insert(key + 1, len - 1, value);
         }
     }
-    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) {
+    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
         if (len == 0 || offset == len) {
             return std::make_pair(key, offset);
         }
@@ -79,6 +79,15 @@ struct naive_trie {
 // impl
 //
 
+struct llm_tokenizer {
+    llm_tokenizer() {}
+    virtual ~llm_tokenizer() = default;
+};
+
+llama_vocab::~llama_vocab() {
+    delete tokenizer;
+}
+
 int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
     GGML_ASSERT(token_left.find(' ') == std::string::npos);
     GGML_ASSERT(token_left.find('\n') == std::string::npos);
@@ -187,10 +196,15 @@ struct llm_bigram_spm {
     size_t size;
 };
 
-struct llm_tokenizer_spm {
-    llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
+struct llm_tokenizer_spm : llm_tokenizer {
+    llm_tokenizer_spm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
+};
+
+struct llm_tokenizer_spm_session {
+    llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+
         // split string into utf8 chars
         int index = 0;
         size_t offs = 0;
@@ -271,7 +285,7 @@ private:
             return;
         }
 
-        resegment(symbols[p->second.first],  output);
+        resegment(symbols[p->second.first], output);
         resegment(symbols[p->second.second], output);
     }
 
@@ -279,7 +293,6 @@ private:
         if (left == -1 || right == -1) {
             return;
         }
-
         const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
         auto token = vocab.token_to_id.find(text);
 
@@ -306,10 +319,11 @@ private:
     }
 
     const llama_vocab & vocab;
+    // currently unused
+    // const llm_tokenizer_spm * spm_tokenizer;
 
     std::vector<llm_symbol> symbols;
     llm_bigram_spm::queue work_queue;
-
     std::map<std::string, std::pair<int, int>> rev_merge;
 };
 
@@ -352,8 +366,8 @@ struct llm_bigram_bpe {
     size_t size;
 };
 
-struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
+struct llm_tokenizer_bpe : llm_tokenizer {
+    llm_tokenizer_bpe(const llama_vocab & vocab) : llm_tokenizer() {
         GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
         switch (vocab.type_pre) {
             case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
@@ -450,6 +464,20 @@ struct llm_tokenizer_bpe {
                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
+                // Note: in theory, the special token (sentinel and image token) regex_exprs below
+                // are unnecessary, as they are split in `tokenizer_st_partition` anyway.
+                // However, since the upstream pre-tokenizer uses them, they are also
+                // included here (see https://huggingface.co/facebook/chameleon-7b).
+                regex_exprs = {
+                    "<sentinel:[0-9]+>",  // Sentinel tokens
+                    "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z",  // Image tokens
+                    "([\\t\\n]|    |  )",  // directly from tokenizer.json
+                    "\\p{N}", // Individual digits
+                    "[\\p{P}!-/:-@\\[-`{-~]",  // Punctuation, Isolated
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -462,7 +490,14 @@ struct llm_tokenizer_bpe {
         }
     }
 
-    void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
+    std::vector<std::string> regex_exprs;
+};
+
+struct llm_tokenizer_bpe_session {
+    llm_tokenizer_bpe_session(const llama_vocab & vocab) : vocab(vocab),
+        bpe_tokenizer(static_cast<const llm_tokenizer_bpe *>(vocab.tokenizer)) {}
+
+    static void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) {
         output.push_back(token_id);
     }
 
@@ -501,12 +536,11 @@ struct llm_tokenizer_bpe {
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
-
-        const auto word_collection = unicode_regex_split(text, regex_exprs);
+        const auto word_collection = unicode_regex_split(text, bpe_tokenizer->regex_exprs);
 
         symbols_final.clear();
 
-        for (auto & word : word_collection) {
+        for (const auto & word : word_collection) {
             work_queue = llm_bigram_bpe::queue();
             symbols.clear();
 
@@ -609,7 +643,6 @@ private:
         if (left == -1 || right == -1) {
             return;
         }
-
         std::string left_token = std::string(symbols[left].text, symbols[left].n);
         std::string right_token = std::string(symbols[right].text, symbols[right].n);
 
@@ -633,12 +666,10 @@ private:
     }
 
     const llama_vocab & vocab;
-
-    std::vector<std::string> regex_exprs;
+    const llm_tokenizer_bpe * bpe_tokenizer;
 
     std::vector<llm_symbol> symbols;
     std::vector<llm_symbol> symbols_final;
-
     llm_bigram_bpe::queue work_queue;
 };
 
@@ -646,15 +677,17 @@ private:
 // WPM tokenizer
 //
 
-struct llm_tokenizer_wpm {
-    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
+struct llm_tokenizer_wpm : llm_tokenizer {
+    llm_tokenizer_wpm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
+};
 
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
-        const auto & token_map = vocab.token_to_id;
+struct llm_tokenizer_wpm_session {
+    llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
 
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        const auto & token_map = vocab.token_to_id;
         // normalize and split by whitespace
         std::vector<std::string> words = preprocess(text);
-
         // bos token prepended already
 
         // find the longest tokens that form the words
@@ -699,7 +732,7 @@ struct llm_tokenizer_wpm {
     }
 
     // TODO: reduce string copies by using cpts_offs array
-    std::vector<std::string> preprocess(const std::string & text) const {
+    static std::vector<std::string> preprocess(const std::string & text) {
         const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
         std::vector<std::string> words(1, "");
 
@@ -751,15 +784,18 @@ struct llm_tokenizer_wpm {
         //(cpt >= 0xFF00 && cpt <= 0xFFEF);
     }
 
+private:
     const llama_vocab & vocab;
+    // currently unused
+    // const llm_tokenizer_wpm * wpm_tokenizer;
 };
 
 //
 // UGM tokenizer
 //
 
-struct llm_tokenizer_ugm {
-    llm_tokenizer_ugm(const llama_vocab & vocab) : vocab(vocab) {
+struct llm_tokenizer_ugm : llm_tokenizer {
+    llm_tokenizer_ugm(const llama_vocab & vocab) : llm_tokenizer() {
         if (vocab.precompiled_charsmap.size() > 0) {
             size_t charsmap_offset = 0;
 
@@ -805,6 +841,30 @@ struct llm_tokenizer_ugm {
         unknown_token_score = min_score - unknown_token_score_penalty;
     }
 
+    // escaped space symbol - U+2581 (Lower One Eighth Block)
+    const std::string escaped_space = "\xE2\x96\x81";
+
+    const char * prefix_replacements = NULL;
+    size_t prefix_replacements_size = 0;
+
+    const uint32_t * xcda_array = NULL;
+    size_t xcda_array_size = 0;
+
+    struct naive_trie user_defined_token_matcher;
+
+    float min_score = FLT_MAX;
+    float max_score = -FLT_MAX;
+
+    float unknown_token_score_penalty = 10.0;
+    float unknown_token_score;
+
+    struct naive_trie token_matcher;
+};
+
+struct llm_tokenizer_ugm_session {
+    llm_tokenizer_ugm_session(const llama_vocab & vocab) : vocab(vocab),
+        ugm_tokenizer(static_cast<const llm_tokenizer_ugm *>(vocab.tokenizer)) {}
+
     /* This implementation is based on SentencePiece optimized Viterbi algorithm for
      * unigram language models. The general idea is to:
      * - move along the input sequence in steps of one UTF code point,
@@ -843,7 +903,7 @@ struct llm_tokenizer_ugm {
             // traverse the token matcher trie to find a matching token
             bool single_codepoint_token_found = false;
             const struct best_tokenization & current_best = tokenization_results[input_offset];
-            const struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);
+            const struct naive_trie * node = ugm_tokenizer->token_matcher.traverse(normalized[prefix_offset++]);
 
             while (prefix_offset <= input_len && node != NULL) {
                 // check if we found valid token in prefix
@@ -873,7 +933,7 @@ struct llm_tokenizer_ugm {
             // if we didn't find a valid token corresponding to the whole UTF code point
             // then use unknown token as the tokenization of this UTF code point
             if (!single_codepoint_token_found) {
-                const double challenger_score = current_best.score_sum + unknown_token_score;
+                const double challenger_score = current_best.score_sum + ugm_tokenizer->unknown_token_score;
                 prefix_offset = input_offset + n_utf8_code_units;
                 struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                 if (challenger_score > current_champ.score_sum) {
@@ -905,7 +965,6 @@ struct llm_tokenizer_ugm {
     }
 
 private:
-    const llama_vocab & vocab;
 
     // helper structure for returning normalization results
     struct normalization_result {
@@ -918,7 +977,7 @@ private:
         normalized->clear();
         normalized->reserve(input.size() * 3);
 
-        const std::string space = vocab.tokenizer_escape_whitespaces ? escaped_space : " ";
+        const std::string space = vocab.tokenizer_escape_whitespaces ? ugm_tokenizer->escaped_space : " ";
 
         bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
         bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
@@ -1000,13 +1059,21 @@ private:
         size_t xcda_array_size;
     };
 
+    // this structure stores the best tokenization so far at input_offset
+    struct best_tokenization {
+        llama_token token_id;
+        size_t input_offset;
+        float score_sum;
+    };
+
     struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
         if (input_offset == input.size()) {
            return { &input[input_offset], 0, 0 };
        }
 
         // if input prefix matches some user-defined token return this token as normalization result
-        auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
+        auto user_defined_token_match =
+            ugm_tokenizer->user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
         if (user_defined_token_match.second > 0) {
             return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
         }
@@ -1014,8 +1081,8 @@ private:
         size_t longest_prefix_length = 0;
         size_t longest_prefix_offset = 0;
 
-        if (xcda_array_size > 0) {
-            struct xcda_array_view xcda_view(xcda_array, xcda_array_size);
+        if (ugm_tokenizer->xcda_array_size > 0) {
+            struct xcda_array_view xcda_view(ugm_tokenizer->xcda_array, ugm_tokenizer->xcda_array_size);
 
             // Find the longest normalized sequence matching the input prefix by walking
             // the XOR-compressed compact double array (XCDA) starting from the root node
@@ -1051,50 +1118,27 @@ private:
 
         if (longest_prefix_length > 0) {
             // we have a match, so return the replacement sequence
-            if (longest_prefix_offset >= prefix_replacements_size) {
+            if (longest_prefix_offset >= ugm_tokenizer->prefix_replacements_size) {
                 throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
             }
-            const char * prefix_replacement = &prefix_replacements[longest_prefix_offset];
+            const char * prefix_replacement = &(ugm_tokenizer->prefix_replacements)[longest_prefix_offset];
             return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
-        } else {
-            // check if the input prefix contains a valid sequence of UTF-8 code units
-            try {
-                // if yes, return this sequence unmodified
-                size_t prefix_offset = input_offset;
-                unicode_cpt_from_utf8(input, prefix_offset);
-                return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
-            } catch (std::invalid_argument & /*ex*/) {
-                // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
-                return { "\xEF\xBF\xBD", 3, 1 };
-            }
         }
-    }
-
-    // escaped space symbol - U+2581 (Lower One Eighth Block)
-    const std::string escaped_space = "\xE2\x96\x81";
 
-    const char * prefix_replacements = NULL;
-    size_t prefix_replacements_size = 0;
-
-    const uint32_t * xcda_array = NULL;
-    size_t xcda_array_size = 0;
-
-    struct naive_trie user_defined_token_matcher;
-
-    // this structure stores the best tokenization so far at input_offset
-    struct best_tokenization {
-        llama_token token_id;
-        size_t input_offset;
-        float score_sum;
-    };
-
-    float min_score = FLT_MAX;
-    float max_score = -FLT_MAX;
-
-    float unknown_token_score_penalty = 10.0;
-    float unknown_token_score;
+        // check if the input prefix contains a valid sequence of UTF-8 code units
+        try {
+            // if yes, return this sequence unmodified
+            size_t prefix_offset = input_offset;
+            unicode_cpt_from_utf8(input, prefix_offset);
+            return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
+        } catch (std::invalid_argument & /*ex*/) {
+            // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
+            return { "\xEF\xBF\xBD", 3, 1 };
+        }
+    }
 
-    struct naive_trie token_matcher;
+    const llama_vocab & vocab;
+    const llm_tokenizer_ugm * ugm_tokenizer;
 };
 
 //
@@ -1155,8 +1199,8 @@ static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
     return output;
 }
 
-struct llm_tokenizer_rwkv {
-    llm_tokenizer_rwkv(const llama_vocab & vocab): vocab(vocab) {
+struct llm_tokenizer_rwkv : llm_tokenizer {
+    llm_tokenizer_rwkv(const llama_vocab & vocab) : llm_tokenizer() {
         // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
         // For now, we decode the vocab here into the lookup we'll use for tokenization.
 
@@ -1168,11 +1212,17 @@ struct llm_tokenizer_rwkv {
         }
     }
 
+    struct naive_trie token_matcher;
+};
+
+struct llm_tokenizer_rwkv_session {
+    llm_tokenizer_rwkv_session(const llama_vocab & vocab) : vocab(vocab),
+        rwkv_tokenizer(static_cast<const llm_tokenizer_rwkv &>(*vocab.tokenizer)) {}
+
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         uint32_t position = 0;
-
         while (position < text.size()) {
-            const struct naive_trie * node = token_matcher.traverse(text[position]);
+            const struct naive_trie * node = rwkv_tokenizer.token_matcher.traverse(text[position]);
             if (node == NULL) {
                 // no matching token found, add unknown token
                 output.push_back(vocab.special_unk_id);
@@ -1197,11 +1247,33 @@ struct llm_tokenizer_rwkv {
         }
     }
 
+private:
     const llama_vocab & vocab;
-
-    struct naive_trie token_matcher;
+    const llm_tokenizer_rwkv & rwkv_tokenizer;
 };
 
+void llama_vocab::init_tokenizer() {
+    switch (type) {
+        case LLAMA_VOCAB_TYPE_SPM:
+            tokenizer = new llm_tokenizer_spm(*this);
+            break;
+        case LLAMA_VOCAB_TYPE_BPE:
+            tokenizer = new llm_tokenizer_bpe(*this);
+            break;
+        case LLAMA_VOCAB_TYPE_WPM:
+            tokenizer = new llm_tokenizer_wpm(*this);
+            break;
+        case LLAMA_VOCAB_TYPE_UGM:
+            tokenizer = new llm_tokenizer_ugm(*this);
+            break;
+        case LLAMA_VOCAB_TYPE_RWKV:
+            tokenizer = new llm_tokenizer_rwkv(*this);
+            break;
+        default:
+            GGML_ABORT("unsupported vocab type");
+    }
+}
+
 //
 // (de-) tokenize
 //
@@ -1263,7 +1335,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 
         // if a fragment is text ( not yet processed )
         if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-            auto & raw_text = fragment.raw_text;
+            const auto & raw_text = fragment.raw_text;
 
             auto raw_text_base_offset = fragment.offset;
             auto raw_text_base_length = fragment.length;
@@ -1362,7 +1434,13 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
     }
 }
 
-std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
+std::vector<llama_vocab::id> llama_tokenize_internal(
+        const llama_vocab & vocab,
+        std::string raw_text,
+        bool add_special,
+        bool parse_special) {
+    GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+
     std::vector<llama_vocab::id> output;
     std::forward_list<fragment_buffer_variant> fragment_buffer;
 
@@ -1399,9 +1477,9 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        llm_tokenizer_spm tokenizer(vocab);
                         llama_escape_whitespace(raw_text);
-                        tokenizer.tokenize(raw_text, output);
+                        llm_tokenizer_spm_session session(vocab);
+                        session.tokenize(raw_text, output);
                         is_prev_special = false;
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
@@ -1423,10 +1501,11 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                llm_tokenizer_bpe tokenizer(vocab);
-
+                llm_tokenizer_bpe_session session(vocab);
+                // it calls some other methods that are not exist in llm_tokenizer,
+                // here just cast it to bpe tokenizer object
                 if (add_special) {
-                    tokenizer.append_bos(output);
+                    session.append_bos(output);
                 }
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -1435,15 +1514,15 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        tokenizer.tokenize(raw_text, output);
+                        session.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                        tokenizer.append(fragment.token, output);
+                        session.append(fragment.token, output);
                     }
                 }
 
                 if (add_special) {
-                    tokenizer.append_eos(output);
-                    tokenizer.check_double_bos_eos(output);
+                    session.append_eos(output);
+                    session.check_double_bos_eos(output);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
@@ -1453,7 +1532,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
                     output.push_back(vocab.special_cls_id);
                 }
 
-                llm_tokenizer_wpm tokenizer(vocab);
+                llm_tokenizer_wpm_session session(vocab);
 
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -1462,7 +1541,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        tokenizer.tokenize(raw_text, output);
+                        session.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
@@ -1475,12 +1554,11 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
             } break;
         case LLAMA_VOCAB_TYPE_UGM:
             {
-                llm_tokenizer_ugm tokenizer(vocab);
-
-                if (add_special && vocab.tokenizer_add_bos != 0) {
+                if (add_special && vocab.tokenizer_add_bos) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                 }
+                llm_tokenizer_ugm_session session(vocab);
 
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -1488,26 +1566,27 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        tokenizer.tokenize(raw_text, output);
+                        session.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
                 }
 
-                if (add_special && vocab.tokenizer_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
                     LLAMA_LOG_WARN(
                         "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                         "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                         "Are you sure this is what you want?\n", __FUNCTION__);
                 }
 
-                if (add_special && vocab.tokenizer_add_eos == 1) {
+                if (add_special && vocab.tokenizer_add_eos) {
                     GGML_ASSERT(vocab.special_eos_id != -1);
                     output.push_back(vocab.special_eos_id);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_RWKV:
             {
+                llm_tokenizer_rwkv_session session(vocab);
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -1516,8 +1595,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
 
-                        llm_tokenizer_rwkv tokenizer(vocab);
-                        tokenizer.tokenize(raw_text, output);
+                        session.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
@@ -1630,13 +1708,13 @@ llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
 }
 
 int32_t llama_tokenize_impl(
-        const struct llama_vocab & vocab,
-        const char * text,
-        int32_t text_len,
-        llama_token * tokens,
-        int32_t n_tokens_max,
-        bool add_special,
-        bool parse_special) {
+    const struct llama_vocab & vocab,
+    const char * text,
+    int32_t text_len,
+    llama_token * tokens,
+    int32_t n_tokens_max,
+    bool add_special,
+    bool parse_special) {
     auto res = llama_tokenize_internal(vocab, std::string(text, text_len), add_special, parse_special);
     if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -1713,11 +1791,13 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
             // suppressing them like CONTROL tokens.
             if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                 return _try_copy(token_text.data(), token_text.size());
-            } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+            }
+            if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                 std::string result = token_text;
                 llama_unescape_whitespace(result);
                 return _try_copy(result.data(), result.size());
-            } else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
+            }
+            if (attr & LLAMA_TOKEN_ATTR_BYTE) {
                 char byte = (char) llama_token_to_byte(vocab, token);
                 return _try_copy((char*) &byte, 1);
             }
@@ -1728,7 +1808,8 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
             // suppressing them like CONTROL tokens.
             if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                 return _try_copy(token_text.data(), token_text.size());
-            } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+            }
+            if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                 std::string result = llama_decode_text(token_text);
                 return _try_copy(result.data(), result.size());
             }
@@ -1761,6 +1842,8 @@ int32_t llama_detokenize_impl(
         int32_t text_len_max,
         bool remove_special,
         bool unparse_special) {
+    GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+
     int32_t avail = text_len_max;
     int32_t total = 0;
 
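The core of the change above is a split of every tokenizer into two pieces: a long-lived llm_tokenizer_* object that llama_vocab now owns (created once by llama_vocab::init_tokenizer(), freed in the new destructor) and a lightweight llm_tokenizer_*_session that holds only per-call state such as symbols, work_queue and rev_merge. The sketch below mirrors the SPM branch of llama_tokenize_internal to show the intended call pattern; it is illustrative only: llm_tokenizer_spm_session is internal to llama-vocab.cpp, and the helper name tokenize_fragment_spm is made up for this example.

// Illustrative sketch of the new tokenizer/session pattern (internal code, not public API).
static void tokenize_fragment_spm(const llama_vocab & vocab, std::string raw_text,
                                  std::vector<llama_vocab::id> & output) {
    // the shared, immutable tokenizer must already exist:
    // llm_load_vocab() is expected to have called vocab.init_tokenizer()
    GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");

    llama_escape_whitespace(raw_text);

    // per-call state lives in the session, so nothing mutable is shared
    // between concurrent tokenizations that use the same vocab
    llm_tokenizer_spm_session session(vocab);
    session.tokenize(raw_text, output);
}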
examples/talk-llama/llama-vocab.h
CHANGED
@@ -8,6 +8,8 @@
 #include <map>
 #include <set>
 
+struct llm_tokenizer;
+
 struct llama_vocab {
     using id = llama_token;
     using token = std::string;
@@ -65,7 +67,14 @@ struct llama_vocab {
 
     std::vector<char> precompiled_charsmap;
 
+    llm_tokenizer * tokenizer = nullptr;
+
+    llama_vocab() = default;
+    ~llama_vocab();
+
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+
+    void init_tokenizer();
 };
 
 //
examples/talk-llama/llama.cpp
CHANGED
|
@@ -215,6 +215,8 @@ enum llm_arch {
|
|
| 215 |
LLM_ARCH_EXAONE,
|
| 216 |
LLM_ARCH_RWKV6,
|
| 217 |
LLM_ARCH_GRANITE,
|
|
|
|
|
|
|
| 218 |
LLM_ARCH_UNKNOWN,
|
| 219 |
};
|
| 220 |
|
|
@@ -266,6 +268,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
| 266 |
{ LLM_ARCH_EXAONE, "exaone" },
|
| 267 |
{ LLM_ARCH_RWKV6, "rwkv6" },
|
| 268 |
{ LLM_ARCH_GRANITE, "granite" },
|
|
|
|
|
|
|
| 269 |
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
| 270 |
};
|
| 271 |
|
|
@@ -302,6 +306,7 @@ enum llm_kv {
|
|
| 302 |
LLM_KV_DECODER_START_TOKEN_ID,
|
| 303 |
LLM_KV_ATTN_LOGIT_SOFTCAPPING,
|
| 304 |
LLM_KV_FINAL_LOGIT_SOFTCAPPING,
|
|
|
|
| 305 |
LLM_KV_RESCALE_EVERY_N_LAYERS,
|
| 306 |
LLM_KV_TIME_MIX_EXTRA_DIM,
|
| 307 |
LLM_KV_TIME_DECAY_EXTRA_DIM,
|
|
@@ -409,6 +414,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
| 409 |
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
| 410 |
{ LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
|
| 411 |
{ LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
|
|
|
|
| 412 |
{ LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
|
| 413 |
{ LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
|
| 414 |
{ LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
|
|
@@ -600,6 +606,8 @@ enum llm_tensor {
|
|
| 600 |
LLM_TENSOR_ENC_FFN_DOWN,
|
| 601 |
LLM_TENSOR_ENC_FFN_UP,
|
| 602 |
LLM_TENSOR_ENC_OUTPUT_NORM,
|
|
|
|
|
|
|
| 603 |
};
|
| 604 |
|
| 605 |
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
|
@@ -787,6 +795,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
| 787 |
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
| 788 |
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 789 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
|
|
|
|
|
| 790 |
},
|
| 791 |
},
|
| 792 |
{
|
|
@@ -822,6 +832,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
| 822 |
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 823 |
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
| 824 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
|
|
| 825 |
},
|
| 826 |
},
|
| 827 |
{
|
|
@@ -1467,6 +1478,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
| 1467 |
{
|
| 1468 |
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 1469 |
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
|
|
| 1470 |
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
| 1471 |
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 1472 |
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
@@ -1478,6 +1490,43 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
| 1478 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 1479 |
},
|
| 1480 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1481 |
{
|
| 1482 |
LLM_ARCH_UNKNOWN,
|
| 1483 |
{
|
|
@@ -2341,6 +2390,7 @@ struct llama_hparams {
|
|
| 2341 |
bool vocab_only;
|
| 2342 |
bool rope_finetuned;
|
| 2343 |
bool use_par_res;
|
|
|
|
| 2344 |
|
| 2345 |
uint32_t n_vocab;
|
| 2346 |
uint32_t n_ctx_train; // context size the model was trained on
|
|
@@ -2396,7 +2446,7 @@ struct llama_hparams {
|
|
| 2396 |
float f_max_alibi_bias = 0.0f;
|
| 2397 |
float f_logit_scale = 0.0f;
|
| 2398 |
|
| 2399 |
-
// Additional scale factors (Granite)
|
| 2400 |
float f_residual_scale = 0.0f;
|
| 2401 |
float f_embedding_scale = 0.0f;
|
| 2402 |
float f_attention_scale = 0.0f;
|
|
@@ -2849,6 +2899,7 @@ struct llama_model {
|
|
| 2849 |
llama_hparams hparams = {};
|
| 2850 |
llama_vocab vocab;
|
| 2851 |
|
|
|
|
| 2852 |
struct ggml_tensor * tok_embd;
|
| 2853 |
struct ggml_tensor * type_embd;
|
| 2854 |
struct ggml_tensor * pos_embd;
|
|
@@ -2861,6 +2912,12 @@ struct llama_model {
|
|
| 2861 |
struct ggml_tensor * output_b;
|
| 2862 |
struct ggml_tensor * output_norm_enc;
|
| 2863 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2864 |
std::vector<llama_layer> layers;
|
| 2865 |
|
| 2866 |
llama_split_mode split_mode;
|
|
@@ -5445,8 +5502,10 @@ static void llm_load_hparams(
|
|
| 5445 |
}
|
| 5446 |
} else {
|
| 5447 |
switch (hparams.n_layer) {
|
|
|
|
| 5448 |
case 22: model.type = e_model::MODEL_1B; break;
|
| 5449 |
case 26: model.type = e_model::MODEL_3B; break;
|
|
|
|
| 5450 |
// granite uses a vocab with len 49152
|
| 5451 |
case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
|
| 5452 |
case 36: model.type = e_model::MODEL_8B; break; // granite
|
|
@@ -5559,11 +5618,11 @@ static void llm_load_hparams(
|
|
| 5559 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
| 5560 |
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
| 5561 |
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
| 5562 |
-
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
| 5563 |
hparams.f_max_alibi_bias = 8.0f;
|
| 5564 |
|
| 5565 |
switch (hparams.n_layer) {
|
| 5566 |
-
case 4:
|
| 5567 |
case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
|
| 5568 |
}
|
| 5569 |
} break;
|
|
@@ -6048,6 +6107,7 @@ static void llm_load_hparams(
|
|
| 6048 |
}
|
| 6049 |
} break;
|
| 6050 |
case LLM_ARCH_GRANITE:
|
|
|
|
| 6051 |
{
|
| 6052 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
| 6053 |
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
|
@@ -6056,11 +6116,24 @@ static void llm_load_hparams(
|
|
| 6056 |
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
| 6057 |
|
| 6058 |
switch (hparams.n_layer) {
|
|
|
|
| 6059 |
case 40: model.type = e_model::MODEL_3B; break;
|
| 6060 |
// Add additional layer/vocab/etc checks here for other model sizes
|
| 6061 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 6062 |
}
|
| 6063 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6064 |
default: (void)0;
|
| 6065 |
}
|
| 6066 |
|
|
@@ -6254,6 +6327,7 @@ static void llm_load_vocab(
|
|
| 6254 |
tokenizer_pre == "phi-2" ||
|
| 6255 |
tokenizer_pre == "jina-es" ||
|
| 6256 |
tokenizer_pre == "jina-de" ||
|
|
|
|
| 6257 |
tokenizer_pre == "jina-v2-es" ||
|
| 6258 |
tokenizer_pre == "jina-v2-de" ||
|
| 6259 |
tokenizer_pre == "jina-v2-code") {
|
|
@@ -6318,6 +6392,11 @@ static void llm_load_vocab(
|
|
| 6318 |
} else if (
|
| 6319 |
tokenizer_pre == "exaone") {
|
| 6320 |
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6321 |
} else {
|
| 6322 |
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
| 6323 |
}
|
|
@@ -6375,7 +6454,12 @@ static void llm_load_vocab(
|
|
| 6375 |
|
| 6376 |
for (uint32_t i = 0; i < n_vocab; i++) {
|
| 6377 |
std::string word = gguf_get_arr_str(ctx, token_idx, i);
|
| 6378 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6379 |
|
| 6380 |
vocab.token_to_id[word] = i;
|
| 6381 |
vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
|
|
@@ -6400,6 +6484,8 @@ static void llm_load_vocab(
|
|
| 6400 |
}
|
| 6401 |
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
|
| 6402 |
|
|
|
|
|
|
|
| 6403 |
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
| 6404 |
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
| 6405 |
// For Fill-In-the-Middle (FIM)/infill models which where converted
|
|
@@ -6454,8 +6540,14 @@ static void llm_load_vocab(
|
|
| 6454 |
vocab.linefeed_id = ids[0];
|
| 6455 |
} else {
|
| 6456 |
const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
|
| 6457 |
-
|
| 6458 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6459 |
}
|
| 6460 |
|
| 6461 |
// special tokens
|
|
@@ -6810,7 +6902,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
| 6810 |
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
| 6811 |
}
|
| 6812 |
|
| 6813 |
-
if (model.arch == LLM_ARCH_GRANITE) {
|
| 6814 |
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
| 6815 |
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
| 6816 |
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
|
@@ -6984,6 +7076,7 @@ static bool llm_load_tensors(
|
|
| 6984 |
case LLM_ARCH_REFACT:
|
| 6985 |
case LLM_ARCH_MINICPM:
|
| 6986 |
case LLM_ARCH_GRANITE:
|
|
|
|
| 6987 |
{
|
| 6988 |
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
| 6989 |
|
|
@@ -7327,6 +7420,12 @@ static bool llm_load_tensors(
|
|
| 7327 |
|
| 7328 |
if (model.arch == LLM_ARCH_BERT) {
|
| 7329 |
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7330 |
}
|
| 7331 |
|
| 7332 |
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
|
@@ -7379,6 +7478,8 @@ static bool llm_load_tensors(
|
|
| 7379 |
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
|
| 7380 |
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
|
| 7381 |
|
|
|
|
|
|
|
| 7382 |
for (int i = 0; i < n_layer; ++i) {
|
| 7383 |
ggml_context * ctx_layer = ctx_for_layer(i);
|
| 7384 |
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
@@ -8704,6 +8805,45 @@ static bool llm_load_tensors(
|
|
| 8704 |
}
|
| 8705 |
|
| 8706 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8707 |
default:
|
| 8708 |
throw std::runtime_error("unknown architecture");
|
| 8709 |
}
|
|
@@ -10173,6 +10313,10 @@ struct llm_build_context {
|
|
| 10173 |
struct ggml_tensor * cur;
|
| 10174 |
|
| 10175 |
switch (pooling_type) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10176 |
case LLAMA_POOLING_TYPE_MEAN:
|
| 10177 |
{
|
| 10178 |
struct ggml_tensor * inp_mean = build_inp_mean();
|
|
@@ -10184,9 +10328,26 @@ struct llm_build_context {
|
|
| 10184 |
struct ggml_tensor * inp_cls = build_inp_cls();
|
| 10185 |
cur = ggml_get_rows(ctx0, inp, inp_cls);
|
| 10186 |
} break;
|
| 10187 |
-
case
|
| 10188 |
{
|
| 10189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10190 |
} break;
|
| 10191 |
default:
|
| 10192 |
{
|
|
@@ -11415,8 +11576,8 @@ struct llm_build_context {
|
|
| 11415 |
inpL = cur;
|
| 11416 |
}
|
| 11417 |
|
| 11418 |
-
// final output
|
| 11419 |
cur = inpL;
|
|
|
|
| 11420 |
cb(cur, "result_embd", -1);
|
| 11421 |
|
| 11422 |
ggml_build_forward_expand(gf, cur);
|
|
@@ -15848,6 +16009,184 @@ struct llm_build_context {
|
|
| 15848 |
|
| 15849 |
return gf;
|
| 15850 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15851 |
};
|
| 15852 |
|
| 15853 |
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
|
@@ -15930,6 +16269,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
| 15930 |
switch (model.arch) {
|
| 15931 |
case LLM_ARCH_LLAMA:
|
| 15932 |
case LLM_ARCH_GRANITE:
|
|
|
|
| 15933 |
{
|
| 15934 |
result = llm.build_llama();
|
| 15935 |
} break;
|
|
@@ -16107,6 +16447,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
| 16107 |
{
|
| 16108 |
result = llm.build_rwkv6();
|
| 16109 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16110 |
default:
|
| 16111 |
GGML_ABORT("fatal error");
|
| 16112 |
}
|
|
@@ -16393,7 +16737,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
|
|
| 16393 |
}
|
| 16394 |
}
|
| 16395 |
|
| 16396 |
-
if (cparams.embeddings &&
|
|
|
|
|
|
|
| 16397 |
const int64_t n_tokens = batch.n_tokens;
|
| 16398 |
const int64_t n_seq_tokens = batch.n_seq_tokens;
|
| 16399 |
const int64_t n_seqs = batch.n_seqs;
|
|
@@ -16408,7 +16754,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
|
|
| 16408 |
const llama_seq_id seq_id = batch.seq_id[s][0];
|
| 16409 |
|
| 16410 |
// TODO: adapt limits to n_seqs when batch.equal_seqs is true
|
| 16411 |
-
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
|
| 16412 |
|
| 16413 |
for (int i = 0; i < n_seq_tokens; ++i) {
|
| 16414 |
const llama_pos pos = batch.pos[s*n_seq_tokens + i];
|
|
@@ -16679,12 +17025,6 @@ static void llama_graph_compute(
|
|
| 16679 |
ggml_cgraph * gf,
|
| 16680 |
int n_threads,
|
| 16681 |
ggml_threadpool * threadpool) {
|
| 16682 |
-
#ifdef GGML_USE_METAL
|
| 16683 |
-
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
| 16684 |
-
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
| 16685 |
-
}
|
| 16686 |
-
#endif
|
| 16687 |
-
|
| 16688 |
if (lctx.backend_cpu != nullptr) {
|
| 16689 |
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
| 16690 |
ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
|
|
@@ -16948,6 +17288,20 @@ static int llama_decode_internal(
|
|
| 16948 |
ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
|
| 16949 |
}
|
| 16950 |
} break;
|
| 16951 |
case LLAMA_POOLING_TYPE_UNSPECIFIED:
|
| 16952 |
{
|
| 16953 |
GGML_ABORT("unknown pooling type");
|
|
@@ -17154,6 +17508,13 @@ static int llama_encode_internal(
|
|
| 17154 |
ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
|
| 17155 |
}
|
| 17156 |
} break;
|
| 17157 |
case LLAMA_POOLING_TYPE_UNSPECIFIED:
|
| 17158 |
{
|
| 17159 |
GGML_ABORT("unknown pooling type");
|
|
@@ -19231,6 +19592,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
| 19231 |
case LLM_ARCH_DEEPSEEK2:
|
| 19232 |
case LLM_ARCH_CHATGLM:
|
| 19233 |
case LLM_ARCH_GRANITE:
|
| 19234 |
return LLAMA_ROPE_TYPE_NORM;
|
| 19235 |
|
| 19236 |
// the pairs of head values are offset by n_rot/2
|
|
|
|
| 215 |
LLM_ARCH_EXAONE,
|
| 216 |
LLM_ARCH_RWKV6,
|
| 217 |
LLM_ARCH_GRANITE,
|
| 218 |
+
LLM_ARCH_GRANITE_MOE,
|
| 219 |
+
LLM_ARCH_CHAMELEON,
|
| 220 |
LLM_ARCH_UNKNOWN,
|
| 221 |
};
|
| 222 |
|
|
|
|
| 268 |
{ LLM_ARCH_EXAONE, "exaone" },
|
| 269 |
{ LLM_ARCH_RWKV6, "rwkv6" },
|
| 270 |
{ LLM_ARCH_GRANITE, "granite" },
|
| 271 |
+
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
|
| 272 |
+
{ LLM_ARCH_CHAMELEON, "chameleon" },
|
| 273 |
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
| 274 |
};
|
| 275 |
|
|
|
|
| 306 |
LLM_KV_DECODER_START_TOKEN_ID,
|
| 307 |
LLM_KV_ATTN_LOGIT_SOFTCAPPING,
|
| 308 |
LLM_KV_FINAL_LOGIT_SOFTCAPPING,
|
| 309 |
+
LLM_KV_SWIN_NORM,
|
| 310 |
LLM_KV_RESCALE_EVERY_N_LAYERS,
|
| 311 |
LLM_KV_TIME_MIX_EXTRA_DIM,
|
| 312 |
LLM_KV_TIME_DECAY_EXTRA_DIM,
|
|
|
|
| 414 |
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
| 415 |
{ LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
|
| 416 |
{ LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
|
| 417 |
+
{ LLM_KV_SWIN_NORM, "%s.swin_norm" },
|
| 418 |
{ LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
|
| 419 |
{ LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
|
| 420 |
{ LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
|
|
|
|
| 606 |
LLM_TENSOR_ENC_FFN_DOWN,
|
| 607 |
LLM_TENSOR_ENC_FFN_UP,
|
| 608 |
LLM_TENSOR_ENC_OUTPUT_NORM,
|
| 609 |
+
LLM_TENSOR_CLS,
|
| 610 |
+
LLM_TENSOR_CLS_OUT,
|
| 611 |
};
|
| 612 |
|
| 613 |
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
|
|
|
| 795 |
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
| 796 |
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 797 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 798 |
+
{ LLM_TENSOR_CLS, "cls" },
|
| 799 |
+
{ LLM_TENSOR_CLS_OUT, "cls.output" },
|
| 800 |
},
|
| 801 |
},
|
| 802 |
{
|
|
|
|
| 832 |
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 833 |
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
| 834 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 835 |
+
{ LLM_TENSOR_CLS, "cls" },
|
| 836 |
},
|
| 837 |
},
|
| 838 |
{
|
|
|
|
| 1478 |
{
|
| 1479 |
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 1480 |
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
| 1481 |
+
{ LLM_TENSOR_OUTPUT, "output" },
|
| 1482 |
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
| 1483 |
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 1484 |
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
|
|
| 1490 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 1491 |
},
|
| 1492 |
},
|
| 1493 |
+
{
|
| 1494 |
+
LLM_ARCH_GRANITE_MOE,
|
| 1495 |
+
{
|
| 1496 |
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 1497 |
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
| 1498 |
+
{ LLM_TENSOR_OUTPUT, "output" },
|
| 1499 |
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
| 1500 |
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 1501 |
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
| 1502 |
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
| 1503 |
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
| 1504 |
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
| 1505 |
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
| 1506 |
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
| 1507 |
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
| 1508 |
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
| 1509 |
+
},
|
| 1510 |
+
},
|
| 1511 |
+
{
|
| 1512 |
+
LLM_ARCH_CHAMELEON,
|
| 1513 |
+
{
|
| 1514 |
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 1515 |
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
| 1516 |
+
{ LLM_TENSOR_OUTPUT, "output" },
|
| 1517 |
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
| 1518 |
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 1519 |
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
| 1520 |
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
| 1521 |
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
| 1522 |
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
| 1523 |
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
| 1524 |
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 1525 |
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 1526 |
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
| 1527 |
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
| 1528 |
+
},
|
| 1529 |
+
},
|
| 1530 |
{
|
| 1531 |
LLM_ARCH_UNKNOWN,
|
| 1532 |
{
|
|
|
|
| 2390 |
bool vocab_only;
|
| 2391 |
bool rope_finetuned;
|
| 2392 |
bool use_par_res;
|
| 2393 |
+
bool swin_norm;
|
| 2394 |
|
| 2395 |
uint32_t n_vocab;
|
| 2396 |
uint32_t n_ctx_train; // context size the model was trained on
|
|
|
|
| 2446 |
float f_max_alibi_bias = 0.0f;
|
| 2447 |
float f_logit_scale = 0.0f;
|
| 2448 |
|
| 2449 |
+
// Additional scale factors (Granite/Granite MoE)
|
| 2450 |
float f_residual_scale = 0.0f;
|
| 2451 |
float f_embedding_scale = 0.0f;
|
| 2452 |
float f_attention_scale = 0.0f;
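
These factors are only declared here; the graph-building code consumes them. As a rough, self-contained sketch of where Granite-style scale factors typically enter the computation (hypothetical scalar stand-ins, not the actual ggml graph code):

// Hypothetical scalar stand-ins for the scale factors declared above.
struct granite_scales {
    float f_embedding_scale; // multiplies the token embeddings on the way in
    float f_attention_scale; // typically used in place of 1/sqrt(head_dim) for the KQ product
    float f_residual_scale;  // damps each residual (attention / FFN) contribution
};

inline float scaled_embedding(float embd, const granite_scales & s) { return embd * s.f_embedding_scale; }
inline float scaled_kq(float kq_dot, const granite_scales & s)      { return kq_dot * s.f_attention_scale; }
inline float scaled_residual_add(float residual, float branch, const granite_scales & s) {
    return residual + branch * s.f_residual_scale;
}
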
|
|
|
|
| 2899 |
llama_hparams hparams = {};
|
| 2900 |
llama_vocab vocab;
|
| 2901 |
|
| 2902 |
+
// TODO: should init all tensors to nullptr
|
| 2903 |
struct ggml_tensor * tok_embd;
|
| 2904 |
struct ggml_tensor * type_embd;
|
| 2905 |
struct ggml_tensor * pos_embd;
|
|
|
|
| 2912 |
struct ggml_tensor * output_b;
|
| 2913 |
struct ggml_tensor * output_norm_enc;
|
| 2914 |
|
| 2915 |
+
// classifier
|
| 2916 |
+
struct ggml_tensor * cls;
|
| 2917 |
+
struct ggml_tensor * cls_b;
|
| 2918 |
+
struct ggml_tensor * cls_out = nullptr;
|
| 2919 |
+
struct ggml_tensor * cls_out_b = nullptr;
|
| 2920 |
+
|
| 2921 |
std::vector<llama_layer> layers;
|
| 2922 |
|
| 2923 |
llama_split_mode split_mode;
|
|
|
|
| 5502 |
}
|
| 5503 |
} else {
|
| 5504 |
switch (hparams.n_layer) {
|
| 5505 |
+
case 16: model.type = e_model::MODEL_1B; break; // Llama 3.2 1B
|
| 5506 |
case 22: model.type = e_model::MODEL_1B; break;
|
| 5507 |
case 26: model.type = e_model::MODEL_3B; break;
|
| 5508 |
+
case 28: model.type = e_model::MODEL_3B; break; // Llama 3.2 3B
|
| 5509 |
// granite uses a vocab with len 49152
|
| 5510 |
case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
|
| 5511 |
case 36: model.type = e_model::MODEL_8B; break; // granite
|
|
|
|
| 5618 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
| 5619 |
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
| 5620 |
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
| 5621 |
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
| 5622 |
hparams.f_max_alibi_bias = 8.0f;
|
| 5623 |
|
| 5624 |
switch (hparams.n_layer) {
|
| 5625 |
+
case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
|
| 5626 |
case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
|
| 5627 |
}
|
| 5628 |
} break;
|
|
|
|
| 6107 |
}
|
| 6108 |
} break;
|
| 6109 |
case LLM_ARCH_GRANITE:
|
| 6110 |
+
case LLM_ARCH_GRANITE_MOE:
|
| 6111 |
{
|
| 6112 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
| 6113 |
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
|
|
|
| 6116 |
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
| 6117 |
|
| 6118 |
switch (hparams.n_layer) {
|
| 6119 |
+
case 32: model.type = e_model::MODEL_3B; break;
|
| 6120 |
case 40: model.type = e_model::MODEL_3B; break;
|
| 6121 |
// Add additional layer/vocab/etc checks here for other model sizes
|
| 6122 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 6123 |
}
|
| 6124 |
} break;
|
| 6125 |
+
case LLM_ARCH_CHAMELEON:
|
| 6126 |
+
{
|
| 6127 |
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
| 6128 |
+
hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
|
| 6129 |
+
ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
|
| 6130 |
+
|
| 6131 |
+
switch (hparams.n_layer) {
|
| 6132 |
+
case 32: model.type = e_model::MODEL_7B; break;
|
| 6133 |
+
case 48: model.type = e_model::MODEL_34B; break;
|
| 6134 |
+
default: model.type = e_model::MODEL_UNKNOWN;
|
| 6135 |
+
}
|
| 6136 |
+
} break;
|
| 6137 |
default: (void)0;
|
| 6138 |
}
|
| 6139 |
|
|
|
|
| 6327 |
tokenizer_pre == "phi-2" ||
|
| 6328 |
tokenizer_pre == "jina-es" ||
|
| 6329 |
tokenizer_pre == "jina-de" ||
|
| 6330 |
+
tokenizer_pre == "jina-v1-en" ||
|
| 6331 |
tokenizer_pre == "jina-v2-es" ||
|
| 6332 |
tokenizer_pre == "jina-v2-de" ||
|
| 6333 |
tokenizer_pre == "jina-v2-code") {
|
|
|
|
| 6392 |
} else if (
|
| 6393 |
tokenizer_pre == "exaone") {
|
| 6394 |
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
|
| 6395 |
+
} else if (
|
| 6396 |
+
tokenizer_pre == "chameleon") {
|
| 6397 |
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
|
| 6398 |
+
vocab.tokenizer_add_bos = true;
|
| 6399 |
+
vocab.tokenizer_clean_spaces = false;
|
| 6400 |
} else {
|
| 6401 |
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
| 6402 |
}
|
|
|
|
| 6454 |
|
| 6455 |
for (uint32_t i = 0; i < n_vocab; i++) {
|
| 6456 |
std::string word = gguf_get_arr_str(ctx, token_idx, i);
|
| 6457 |
+
|
| 6458 |
+
//GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
|
| 6459 |
+
if (word.empty()) {
|
| 6460 |
+
LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
|
| 6461 |
+
word = "[EMPTY_" + std::to_string(i) + "]";
|
| 6462 |
+
}
|
| 6463 |
|
| 6464 |
vocab.token_to_id[word] = i;
|
| 6465 |
vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
|
|
|
|
| 6484 |
}
|
| 6485 |
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
|
| 6486 |
|
| 6487 |
+
vocab.init_tokenizer();
|
| 6488 |
+
|
| 6489 |
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
| 6490 |
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
| 6491 |
// For Fill-In-the-Middle (FIM)/infill models which where converted
|
|
|
|
| 6540 |
vocab.linefeed_id = ids[0];
|
| 6541 |
} else {
|
| 6542 |
const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
|
| 6543 |
+
|
| 6544 |
+
//GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
|
| 6545 |
+
if (ids.empty()) {
|
| 6546 |
+
LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
|
| 6547 |
+
vocab.linefeed_id = vocab.special_pad_id;
|
| 6548 |
+
} else {
|
| 6549 |
+
vocab.linefeed_id = ids[0];
|
| 6550 |
+
}
|
| 6551 |
}
|
| 6552 |
|
| 6553 |
// special tokens
|
|
|
|
| 6902 |
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
| 6903 |
}
|
| 6904 |
|
| 6905 |
+
if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
|
| 6906 |
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
| 6907 |
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
| 6908 |
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
|
|
|
| 7076 |
case LLM_ARCH_REFACT:
|
| 7077 |
case LLM_ARCH_MINICPM:
|
| 7078 |
case LLM_ARCH_GRANITE:
|
| 7079 |
+
case LLM_ARCH_GRANITE_MOE:
|
| 7080 |
{
|
| 7081 |
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
| 7082 |
|
|
|
|
| 7420 |
|
| 7421 |
if (model.arch == LLM_ARCH_BERT) {
|
| 7422 |
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train});
|
| 7423 |
+
|
| 7424 |
+
model.cls = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
| 7425 |
+
model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
| 7426 |
+
|
| 7427 |
+
model.cls_out = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
| 7428 |
+
model.cls_out_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
| 7429 |
}
|
| 7430 |
|
| 7431 |
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
|
|
|
| 7478 |
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
|
| 7479 |
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
|
| 7480 |
|
| 7481 |
+
model.cls = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
| 7482 |
+
model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
| 7483 |
for (int i = 0; i < n_layer; ++i) {
|
| 7484 |
ggml_context * ctx_layer = ctx_for_layer(i);
|
| 7485 |
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
|
|
| 8805 |
}
|
| 8806 |
|
| 8807 |
} break;
|
| 8808 |
+
case LLM_ARCH_CHAMELEON:
|
| 8809 |
+
{
|
| 8810 |
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
| 8811 |
+
|
| 8812 |
+
// output
|
| 8813 |
+
{
|
| 8814 |
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
| 8815 |
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
| 8816 |
+
|
| 8817 |
+
// if output is NULL, init from the input tok embed
|
| 8818 |
+
if (model.output == NULL) {
|
| 8819 |
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
| 8820 |
+
}
|
| 8821 |
+
}
|
| 8822 |
+
|
| 8823 |
+
for (int i = 0; i < n_layer; ++i) {
|
| 8824 |
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
| 8825 |
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
| 8826 |
+
|
| 8827 |
+
auto & layer = model.layers[i];
|
| 8828 |
+
|
| 8829 |
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
| 8830 |
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head});
|
| 8831 |
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv});
|
| 8832 |
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
| 8833 |
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
| 8834 |
+
|
| 8835 |
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
| 8836 |
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
| 8837 |
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
| 8838 |
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
| 8839 |
+
|
| 8840 |
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
| 8841 |
+
|
| 8842 |
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
| 8843 |
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
| 8844 |
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
| 8845 |
+
}
|
| 8846 |
+
} break;
|
| 8847 |
default:
|
| 8848 |
throw std::runtime_error("unknown architecture");
|
| 8849 |
}
|
|
|
|
| 10313 |
struct ggml_tensor * cur;
|
| 10314 |
|
| 10315 |
switch (pooling_type) {
|
| 10316 |
+
case LLAMA_POOLING_TYPE_NONE:
|
| 10317 |
+
{
|
| 10318 |
+
cur = inp;
|
| 10319 |
+
} break;
|
| 10320 |
case LLAMA_POOLING_TYPE_MEAN:
|
| 10321 |
{
|
| 10322 |
struct ggml_tensor * inp_mean = build_inp_mean();
|
|
|
|
| 10328 |
struct ggml_tensor * inp_cls = build_inp_cls();
|
| 10329 |
cur = ggml_get_rows(ctx0, inp, inp_cls);
|
| 10330 |
} break;
|
| 10331 |
+
case LLAMA_POOLING_TYPE_RANK:
|
| 10332 |
{
|
| 10333 |
+
struct ggml_tensor * inp_cls = build_inp_cls();
|
| 10334 |
+
inp = ggml_get_rows(ctx0, inp, inp_cls);
|
| 10335 |
+
|
| 10336 |
+
// classification head
|
| 10337 |
+
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
|
| 10338 |
+
GGML_ASSERT(model.cls != nullptr);
|
| 10339 |
+
GGML_ASSERT(model.cls_b != nullptr);
|
| 10340 |
+
|
| 10341 |
+
cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b);
|
| 10342 |
+
cur = ggml_tanh(ctx0, cur);
|
| 10343 |
+
|
| 10344 |
+
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
| 10345 |
+
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
|
| 10346 |
+
if (model.cls_out) {
|
| 10347 |
+
GGML_ASSERT(model.cls_out_b != nullptr);
|
| 10348 |
+
|
| 10349 |
+
cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b);
|
| 10350 |
+
}
|
| 10351 |
} break;
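
The RANK branch above attaches the model's classification head to the graph: the pooled row passes through the cls dense layer with a tanh, and, when cls_out is present, a final one-row projection reduces it to a single score. A plain-C++ sketch of the same arithmetic, for illustration only (row-major weights assumed):

#include <cmath>
#include <vector>

// h = tanh(W_cls * x + b_cls); score = dot(w_out, h) + b_out
static float rank_score(const std::vector<std::vector<float>> & W_cls,
                        const std::vector<float> & b_cls,
                        const std::vector<float> & w_out,
                        float b_out,
                        const std::vector<float> & x) {
    std::vector<float> h(b_cls.size());
    for (size_t i = 0; i < h.size(); ++i) {
        float acc = b_cls[i];
        for (size_t j = 0; j < x.size(); ++j) {
            acc += W_cls[i][j] * x[j];
        }
        h[i] = std::tanh(acc);
    }
    float score = b_out;
    for (size_t i = 0; i < h.size(); ++i) {
        score += w_out[i] * h[i];
    }
    return score;
}
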
|
| 10352 |
default:
|
| 10353 |
{
|
|
|
|
| 11576 |
inpL = cur;
|
| 11577 |
}
|
| 11578 |
|
|
|
|
| 11579 |
cur = inpL;
|
| 11580 |
+
|
| 11581 |
cb(cur, "result_embd", -1);
|
| 11582 |
|
| 11583 |
ggml_build_forward_expand(gf, cur);
|
|
|
|
| 16009 |
|
| 16010 |
return gf;
|
| 16011 |
}
|
| 16012 |
+
|
| 16013 |
+
// ref: https://github.com/facebookresearch/chameleon
|
| 16014 |
+
// based on the original build_llama() function, changes:
|
| 16015 |
+
// * qk-norm
|
| 16016 |
+
// * swin-norm
|
| 16017 |
+
// * removed bias
|
| 16018 |
+
// * removed MoE
|
| 16019 |
+
struct ggml_cgraph * build_chameleon() {
|
| 16020 |
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
| 16021 |
+
|
| 16022 |
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
| 16023 |
+
int32_t n_tokens = this->n_tokens;
|
| 16024 |
+
|
| 16025 |
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
| 16026 |
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
| 16027 |
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
| 16028 |
+
|
| 16029 |
+
struct ggml_tensor * cur;
|
| 16030 |
+
struct ggml_tensor * inpL;
|
| 16031 |
+
|
| 16032 |
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
| 16033 |
+
|
| 16034 |
+
// inp_pos - contains the positions
|
| 16035 |
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
| 16036 |
+
|
| 16037 |
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 16038 |
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
| 16039 |
+
|
| 16040 |
+
for (int il = 0; il < n_layer; ++il) {
|
| 16041 |
+
struct ggml_tensor * inpSA = inpL;
|
| 16042 |
+
|
| 16043 |
+
// norm
|
| 16044 |
+
if (hparams.swin_norm) {
|
| 16045 |
+
cur = inpL;
|
| 16046 |
+
} else {
|
| 16047 |
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
| 16048 |
+
model.layers[il].attn_norm, NULL,
|
| 16049 |
+
LLM_NORM_RMS, cb, il);
|
| 16050 |
+
cb(cur, "attn_norm", il);
|
| 16051 |
+
}
|
| 16052 |
+
|
| 16053 |
+
// self-attention
|
| 16054 |
+
{
|
| 16055 |
+
// compute Q and K and RoPE them
|
| 16056 |
+
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
| 16057 |
+
cb(Qcur, "Qcur", il);
|
| 16058 |
+
|
| 16059 |
+
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
| 16060 |
+
cb(Kcur, "Kcur", il);
|
| 16061 |
+
|
| 16062 |
+
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
| 16063 |
+
cb(Vcur, "Vcur", il);
|
| 16064 |
+
|
| 16065 |
+
if (model.layers[il].attn_q_norm) {
|
| 16066 |
+
Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
|
| 16067 |
+
ggml_element_size(Qcur) * n_embd_head,
|
| 16068 |
+
ggml_element_size(Qcur) * n_embd_head * n_head,
|
| 16069 |
+
0);
|
| 16070 |
+
cb(Qcur, "Qcur", il);
|
| 16071 |
+
|
| 16072 |
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
| 16073 |
+
model.layers[il].attn_q_norm,
|
| 16074 |
+
model.layers[il].attn_q_norm_b,
|
| 16075 |
+
LLM_NORM, cb, il);
|
| 16076 |
+
cb(Qcur, "Qcur", il);
|
| 16077 |
+
}
|
| 16078 |
+
|
| 16079 |
+
if (model.layers[il].attn_k_norm) {
|
| 16080 |
+
Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
|
| 16081 |
+
ggml_element_size(Kcur) * n_embd_head,
|
| 16082 |
+
ggml_element_size(Kcur) * n_embd_head * n_head_kv,
|
| 16083 |
+
0);
|
| 16084 |
+
cb(Kcur, "Kcur", il);
|
| 16085 |
+
|
| 16086 |
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
| 16087 |
+
model.layers[il].attn_k_norm,
|
| 16088 |
+
model.layers[il].attn_k_norm_b,
|
| 16089 |
+
LLM_NORM, cb, il);
|
| 16090 |
+
cb(Kcur, "Kcur", il);
|
| 16091 |
+
}
|
| 16092 |
+
|
| 16093 |
+
Qcur = ggml_rope_ext(
|
| 16094 |
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
| 16095 |
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
| 16096 |
+
ext_factor, attn_factor, beta_fast, beta_slow
|
| 16097 |
+
);
|
| 16098 |
+
cb(Qcur, "Qcur", il);
|
| 16099 |
+
|
| 16100 |
+
Kcur = ggml_rope_ext(
|
| 16101 |
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
| 16102 |
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
| 16103 |
+
ext_factor, attn_factor, beta_fast, beta_slow
|
| 16104 |
+
);
|
| 16105 |
+
cb(Kcur, "Kcur", il);
|
| 16106 |
+
|
| 16107 |
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
| 16108 |
+
model.layers[il].wo, nullptr,
|
| 16109 |
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 16110 |
+
|
| 16111 |
+
if (hparams.swin_norm) {
|
| 16112 |
+
cur = llm_build_norm(ctx0, cur, hparams,
|
| 16113 |
+
model.layers[il].attn_norm, NULL,
|
| 16114 |
+
LLM_NORM_RMS, cb, il);
|
| 16115 |
+
}
|
| 16116 |
+
}
|
| 16117 |
+
|
| 16118 |
+
if (il == n_layer - 1) {
|
| 16119 |
+
// skip computing output for unused tokens
|
| 16120 |
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
| 16121 |
+
n_tokens = n_outputs;
|
| 16122 |
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
| 16123 |
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
| 16124 |
+
}
|
| 16125 |
+
|
| 16126 |
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
| 16127 |
+
cb(ffn_inp, "ffn_inp", il);
|
| 16128 |
+
|
| 16129 |
+
// feed-forward network
|
| 16130 |
+
if (!hparams.swin_norm) {
|
| 16131 |
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
| 16132 |
+
model.layers[il].ffn_norm, NULL,
|
| 16133 |
+
LLM_NORM_RMS, cb, il);
|
| 16134 |
+
cb(cur, "ffn_norm", il);
|
| 16135 |
+
}
|
| 16136 |
+
|
| 16137 |
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
| 16138 |
+
model.layers[il].ffn_up, NULL, NULL,
|
| 16139 |
+
model.layers[il].ffn_gate, NULL, NULL,
|
| 16140 |
+
model.layers[il].ffn_down, NULL, NULL,
|
| 16141 |
+
NULL,
|
| 16142 |
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
| 16143 |
+
cb(cur, "ffn_out", il);
|
| 16144 |
+
|
| 16145 |
+
if (hparams.swin_norm) {
|
| 16146 |
+
cur = llm_build_norm(ctx0, cur, hparams,
|
| 16147 |
+
model.layers[il].ffn_norm, NULL,
|
| 16148 |
+
LLM_NORM_RMS, cb, il);
|
| 16149 |
+
cb(cur, "ffn_norm", il);
|
| 16150 |
+
}
|
| 16151 |
+
|
| 16152 |
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
| 16153 |
+
cb(cur, "ffn_out", il);
|
| 16154 |
+
|
| 16155 |
+
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
| 16156 |
+
cb(cur, "l_out", il);
|
| 16157 |
+
|
| 16158 |
+
// input for next layer
|
| 16159 |
+
inpL = cur;
|
| 16160 |
+
}
|
| 16161 |
+
|
| 16162 |
+
cur = inpL;
|
| 16163 |
+
|
| 16164 |
+
cur = llm_build_norm(ctx0, cur, hparams,
|
| 16165 |
+
model.output_norm, NULL,
|
| 16166 |
+
LLM_NORM_RMS, cb, -1);
|
| 16167 |
+
cb(cur, "result_norm", -1);
|
| 16168 |
+
|
| 16169 |
+
// lm_head
|
| 16170 |
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
| 16171 |
+
cb(cur, "result_output_with_img_logits", -1);
|
| 16172 |
+
|
| 16173 |
+
// TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
|
| 16174 |
+
// Needs to be removed once image outputs are supported.
|
| 16175 |
+
int img_token_end_idx = 8196;
|
| 16176 |
+
int img_token_start_idx = 4;
|
| 16177 |
+
int num_img_tokens = img_token_end_idx - img_token_start_idx;
|
| 16178 |
+
// creates 1d tensor of size num_img_tokens and values -FLT_MAX,
|
| 16179 |
+
// which ensures that text token values are always at least larger than image token values
|
| 16180 |
+
struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
|
| 16181 |
+
img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
|
| 16182 |
+
cb(img_logits, "img_logits", -1);
|
| 16183 |
+
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
|
| 16184 |
+
cb(cur, "result_output", -1);
|
| 16185 |
+
|
| 16186 |
+
ggml_build_forward_expand(gf, cur);
|
| 16187 |
+
|
| 16188 |
+
return gf;
|
| 16189 |
+
}
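
The block above suppresses image-token logits by overwriting the ids in [img_token_start_idx, img_token_end_idx) with -FLT_MAX, so text tokens always dominate sampling. The same trick outside of ggml, as a minimal illustration:

#include <cfloat>
#include <vector>

// Force a contiguous range of vocab ids to -FLT_MAX so they can never be sampled.
static void suppress_token_range(std::vector<float> & logits, int first, int last) {
    for (int i = first; i < last && i < (int) logits.size(); ++i) {
        logits[i] = -FLT_MAX;
    }
}
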
|
| 16190 |
};
|
| 16191 |
|
| 16192 |
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
|
|
|
| 16269 |
switch (model.arch) {
|
| 16270 |
case LLM_ARCH_LLAMA:
|
| 16271 |
case LLM_ARCH_GRANITE:
|
| 16272 |
+
case LLM_ARCH_GRANITE_MOE:
|
| 16273 |
{
|
| 16274 |
result = llm.build_llama();
|
| 16275 |
} break;
|
|
|
|
| 16447 |
{
|
| 16448 |
result = llm.build_rwkv6();
|
| 16449 |
} break;
|
| 16450 |
+
case LLM_ARCH_CHAMELEON:
|
| 16451 |
+
{
|
| 16452 |
+
result = llm.build_chameleon();
|
| 16453 |
+
} break;
|
| 16454 |
default:
|
| 16455 |
GGML_ABORT("fatal error");
|
| 16456 |
}
|
|
|
|
| 16737 |
}
|
| 16738 |
}
|
| 16739 |
|
| 16740 |
+
if (cparams.embeddings && (
|
| 16741 |
+
cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
|
| 16742 |
+
cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
|
| 16743 |
const int64_t n_tokens = batch.n_tokens;
|
| 16744 |
const int64_t n_seq_tokens = batch.n_seq_tokens;
|
| 16745 |
const int64_t n_seqs = batch.n_seqs;
|
|
|
|
| 16754 |
const llama_seq_id seq_id = batch.seq_id[s][0];
|
| 16755 |
|
| 16756 |
// TODO: adapt limits to n_seqs when batch.equal_seqs is true
|
| 16757 |
+
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
|
| 16758 |
|
| 16759 |
for (int i = 0; i < n_seq_tokens; ++i) {
|
| 16760 |
const llama_pos pos = batch.pos[s*n_seq_tokens + i];
|
|
|
|
| 17025 |
ggml_cgraph * gf,
|
| 17026 |
int n_threads,
|
| 17027 |
ggml_threadpool * threadpool) {
|
| 17028 |
if (lctx.backend_cpu != nullptr) {
|
| 17029 |
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
| 17030 |
ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
|
|
|
|
| 17288 |
ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
|
| 17289 |
}
|
| 17290 |
} break;
|
| 17291 |
+
case LLAMA_POOLING_TYPE_RANK:
|
| 17292 |
+
{
|
| 17293 |
+
// extract the rerank score - a single float per sequence
|
| 17294 |
+
auto & embd_seq_out = lctx.embd_seq;
|
| 17295 |
+
|
| 17296 |
+
for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
|
| 17297 |
+
const llama_seq_id seq_id = ubatch.seq_id[s][0];
|
| 17298 |
+
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
|
| 17299 |
+
continue;
|
| 17300 |
+
}
|
| 17301 |
+
embd_seq_out[seq_id].resize(1);
|
| 17302 |
+
ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
|
| 17303 |
+
}
|
| 17304 |
+
} break;
|
| 17305 |
case LLAMA_POOLING_TYPE_UNSPECIFIED:
|
| 17306 |
{
|
| 17307 |
GGML_ABORT("unknown pooling type");
|
|
|
|
| 17508 |
ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
|
| 17509 |
}
|
| 17510 |
} break;
|
| 17511 |
+
case LLAMA_POOLING_TYPE_RANK:
|
| 17512 |
+
{
|
| 17513 |
+
// TODO: this likely should be the same logic as in llama_decoder_internal, but better to
|
| 17514 |
+
// wait for an encoder model that requires this pooling type in order to test it
|
| 17515 |
+
// https://github.com/ggerganov/llama.cpp/pull/9510
|
| 17516 |
+
GGML_ABORT("RANK pooling not implemented yet");
|
| 17517 |
+
}
|
| 17518 |
case LLAMA_POOLING_TYPE_UNSPECIFIED:
|
| 17519 |
{
|
| 17520 |
GGML_ABORT("unknown pooling type");
|
|
|
|
| 19592 |
case LLM_ARCH_DEEPSEEK2:
|
| 19593 |
case LLM_ARCH_CHATGLM:
|
| 19594 |
case LLM_ARCH_GRANITE:
|
| 19595 |
+
case LLM_ARCH_GRANITE_MOE:
|
| 19596 |
+
case LLM_ARCH_CHAMELEON:
|
| 19597 |
return LLAMA_ROPE_TYPE_NORM;
|
| 19598 |
|
| 19599 |
// the pairs of head values are offset by n_rot/2
|
examples/talk-llama/llama.h
CHANGED
|
@@ -102,6 +102,7 @@ extern "C" {
|
|
| 102 |
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
| 103 |
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
| 104 |
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
|
|
|
| 105 |
};
|
| 106 |
|
| 107 |
enum llama_rope_type {
|
|
@@ -192,6 +193,7 @@ extern "C" {
|
|
| 192 |
LLAMA_POOLING_TYPE_MEAN = 1,
|
| 193 |
LLAMA_POOLING_TYPE_CLS = 2,
|
| 194 |
LLAMA_POOLING_TYPE_LAST = 3,
|
|
|
|
| 195 |
};
|
| 196 |
|
| 197 |
enum llama_attention_type {
|
|
@@ -201,9 +203,9 @@ extern "C" {
|
|
| 201 |
};
|
| 202 |
|
| 203 |
enum llama_split_mode {
|
| 204 |
- LLAMA_SPLIT_MODE_NONE
|
| 205 |
- LLAMA_SPLIT_MODE_LAYER
|
| 206 |
- LLAMA_SPLIT_MODE_ROW
|
| 207 |
};
|
| 208 |
|
| 209 |
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
|
@@ -871,7 +873,8 @@ extern "C" {
|
|
| 871 |
|
| 872 |
// Get the embeddings for a sequence id
|
| 873 |
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
|
| 874 |
- //
|
|
|
|
| 875 |
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
|
| 876 |
|
| 877 |
//
|
|
@@ -910,6 +913,8 @@ extern "C" {
|
|
| 910 |
//
|
| 911 |
// Tokenization
|
| 912 |
//
|
| 913 |
|
| 914 |
/// @details Convert the provided text into tokens.
|
| 915 |
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
|
|
|
|
| 102 |
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
| 103 |
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
| 104 |
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
| 105 |
+
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
| 106 |
};
|
| 107 |
|
| 108 |
enum llama_rope_type {
|
|
|
|
| 193 |
LLAMA_POOLING_TYPE_MEAN = 1,
|
| 194 |
LLAMA_POOLING_TYPE_CLS = 2,
|
| 195 |
LLAMA_POOLING_TYPE_LAST = 3,
|
| 196 |
+
LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
|
| 197 |
};
|
| 198 |
|
| 199 |
enum llama_attention_type {
|
|
|
|
| 203 |
};
|
| 204 |
|
| 205 |
enum llama_split_mode {
|
| 206 |
+
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
| 207 |
+
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
| 208 |
+
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
|
| 209 |
};
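
With explicit values and comments on the enumerators, a minimal usage sketch (field names as declared in llama.h at the time of this sync; the path and layer count are placeholders):

#include "llama.h"

// Load a model with layer-wise splitting across the available GPUs.
static llama_model * load_split_model(const char * path) {
    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode   = LLAMA_SPLIT_MODE_LAYER; // split layers and KV across GPUs
    mparams.n_gpu_layers = 99;                     // offload as many layers as fit
    return llama_load_model_from_file(path, mparams);
}
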
|
| 210 |
|
| 211 |
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
|
|
|
| 873 |
|
| 874 |
// Get the embeddings for a sequence id
|
| 875 |
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
|
| 876 |
+
// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
|
| 877 |
+
// otherwise: float[n_embd] (1-dimensional)
|
| 878 |
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
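
A short usage sketch for the RANK case documented above; the returned buffer then holds a single float per sequence:

#include "llama.h"

// After llama_decode() with embeddings enabled and pooling_type == LLAMA_POOLING_TYPE_RANK,
// the reranking score of a sequence is exposed as a 1-element "embedding".
static float seq_rank_score(llama_context * ctx, llama_seq_id seq_id) {
    const float * out = llama_get_embeddings_seq(ctx, seq_id);
    return out != nullptr ? out[0] : 0.0f;
}
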
|
| 879 |
|
| 880 |
//
|
|
|
|
| 913 |
//
|
| 914 |
// Tokenization
|
| 915 |
//
|
| 916 |
+
// The API is thread-safe.
|
| 917 |
+
//
|
| 918 |
|
| 919 |
/// @details Convert the provided text into tokens.
|
| 920 |
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
|
src/whisper.cpp
CHANGED
|
@@ -204,11 +204,6 @@ static bool ggml_graph_compute_helper(
|
|
| 204 |
if (ggml_backend_is_blas(backend)) {
|
| 205 |
ggml_backend_blas_set_n_threads(backend, n_threads);
|
| 206 |
}
|
| 207 |
- #endif
|
| 208 |
- #ifdef GGML_USE_METAL
|
| 209 |
- if (ggml_backend_is_metal(backend)) {
|
| 210 |
- ggml_backend_metal_set_n_cb(backend, n_threads);
|
| 211 |
- }
|
| 212 |
#endif
|
| 213 |
}
|
| 214 |
|
|
|
|
| 204 |
if (ggml_backend_is_blas(backend)) {
|
| 205 |
ggml_backend_blas_set_n_threads(backend, n_threads);
|
| 206 |
}
|
| 207 |
#endif
|
| 208 |
}
|
| 209 |
|