ggerganov committed
Commit 7ef5ccc · 1 Parent(s): 32cfce9

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/talk-llama/llama.h CHANGED
@@ -67,6 +67,7 @@ extern "C" {
         LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
         LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
         LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
+        LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
     };
 
     // pre-tokenization types
@@ -87,6 +88,10 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
         LLAMA_VOCAB_PRE_TYPE_PORO = 15,
+        LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
+        LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
+        LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
+        LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
     };
 
     // note: these values should be synchronized with ggml_rope
@@ -177,6 +182,12 @@ extern "C" {
         LLAMA_POOLING_TYPE_LAST = 3,
     };
 
+    enum llama_attention_type {
+        LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
+        LLAMA_ATTENTION_TYPE_CAUSAL = 0,
+        LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
+    };
+
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -294,6 +305,7 @@ extern "C" {
 
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
+        enum llama_attention_type attention_type; // attention type to use for embeddings
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -482,6 +494,13 @@ extern "C" {
     // Get a llama model tensor
     LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
 
+    // Returns true if the model contains an encoder that requires llama_encode() call
+    LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
+
+    // For encoder-decoder models, this function returns id of the token that must be provided
+    // to the decoder to start generating output sequence. For other models, it returns -1.
+    LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
+
     // Returns 0 on success
     LLAMA_API uint32_t llama_model_quantize(
             const char * fname_inp,
@@ -767,6 +786,14 @@ extern "C" {
     // Frees a batch of tokens allocated with llama_batch_init()
     LLAMA_API void llama_batch_free(struct llama_batch batch);
 
+    // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
+    // Stores the encoder output internally for later use by the decoder cross-attention layers.
+    // 0 - success
+    // < 0 - error
+    LLAMA_API int32_t llama_encode(
+            struct llama_context * ctx,
+            struct llama_batch batch);
+
     // Positive return values does not mean a fatal error, but rather a warning.
     // 0 - success
     // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
@@ -857,6 +884,7 @@ extern "C" {
     LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
     LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+    LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
 
     // Returns -1 if unknown, 1 for true or 0 for false.
     LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
@@ -878,6 +906,7 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
    ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
@@ -892,15 +921,31 @@ extern "C" {
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
-    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
     // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
             const struct llama_model * model,
             llama_token token,
             char * buf,
             int32_t length,
+            int32_t lstrip,
             bool special);
 
+    /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
+    /// @param text The char pointer must be large enough to hold the resulting text.
+    /// @return Returns the number of chars/bytes on success, no more than text_len_max.
+    /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
+    /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
+    /// @param unparse_special If true, special tokens are rendered in the output.
+    LLAMA_API int32_t llama_detokenize(
+            const struct llama_model * model,
+            const llama_token * tokens,
+            int32_t n_tokens,
+            char * text,
+            int32_t text_len_max,
+            bool remove_special,
+            bool unparse_special);
+
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
     /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
@@ -924,6 +969,12 @@ extern "C" {
     // Grammar
     //
 
+    /// Initialize a llama_grammar.
+    ///
+    /// @param rules The rule elements of the grammar to initialize.
+    /// @param n_rules The number of rules.
+    /// @param start_rule_index The index of the root rule (the starting point of the grammar).
+    /// @return The initialized llama_grammar or nullptr if initialization failed.
     LLAMA_API struct llama_grammar * llama_grammar_init(
             const llama_grammar_element ** rules,
             size_t n_rules,
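
Note: the encoder-related additions above change the generation flow for encoder-decoder models such as T5. A minimal sketch of how a caller might combine them — not part of this commit, and assuming a `model`/`ctx` pair plus a tokenized `prompt` obtained through the existing API (`llama_tokenize()`, `llama_batch_get_one()`, `llama_decode()`, `llama_token_bos()`):

    // Sketch only: driving an encoder-decoder model with the new API.
    // Sampling and error handling are omitted for brevity.
    std::vector<llama_token> prompt = { /* result of llama_tokenize() */ };

    if (llama_model_has_encoder(model)) {
        // Run the encoder once over the whole prompt; its output is stored
        // internally for the decoder's cross-attention layers.
        llama_batch enc_batch = llama_batch_get_one(prompt.data(), (int32_t) prompt.size(), 0, 0);
        if (llama_encode(ctx, enc_batch) < 0) {
            // encoder pass failed
        }

        // The decoder starts from a model-specific token; fall back to BOS
        // when the model does not define one (the function returns -1).
        llama_token dec_start = llama_model_decoder_start_token(model);
        if (dec_start == -1) {
            dec_start = llama_token_bos(model);
        }

        llama_batch dec_batch = llama_batch_get_one(&dec_start, 1, 0, 0);
        llama_decode(ctx, dec_batch);
        // ...continue sampling/decoding as with a decoder-only model...
    }
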
examples/talk-llama/talk-llama.cpp CHANGED
@@ -35,10 +35,10 @@ static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const
 
 static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
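
Note: the wrapper above grows its buffer when `llama_token_to_piece()` reports the required size as a negative value. The same pattern applies to the new whole-sequence `llama_detokenize()` declared in llama.h; a hypothetical helper in the same style (not part of this commit, assuming the usual `<string>`/`<vector>`/`<algorithm>` includes) might look like:

    // Hypothetical helper: convert a token sequence back to text with
    // llama_detokenize(), resizing when the first call returns the required
    // size as a negative number.
    static std::string tokens_to_text(const struct llama_model * model, const std::vector<llama_token> & tokens) {
        std::string text(std::max<size_t>(tokens.size() * 4, 16), '\0');
        int32_t n = llama_detokenize(model, tokens.data(), (int32_t) tokens.size(),
                                     &text[0], (int32_t) text.size(),
                                     /*remove_special=*/false, /*unparse_special=*/false);
        if (n < 0) {
            text.resize((size_t) -n);
            n = llama_detokenize(model, tokens.data(), (int32_t) tokens.size(),
                                 &text[0], (int32_t) text.size(),
                                 /*remove_special=*/false, /*unparse_special=*/false);
            GGML_ASSERT(n >= 0);
        }
        text.resize((size_t) std::max<int32_t>(n, 0));
        return text;
    }
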
examples/talk-llama/unicode-data.cpp CHANGED
@@ -7030,4 +7030,3 @@ const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd
 {0x02FA1C, 0x02FA1C, 0x009F3B},
 {0x02FA1D, 0x02FA1D, 0x02A600},
 };
-
 
examples/talk-llama/unicode.cpp CHANGED
@@ -23,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
     return result;
 }
 
-static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
     assert(offset < utf8.size());
     if (!(utf8[offset + 0] & 0x80)) {
         auto result = utf8[offset + 0];
@@ -232,8 +232,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
     };
 
     auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-        static const codepoint_flags undef(codepoint_flags::UNDEFINED);
-        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
+        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
     };
 
     size_t _prev_end = offset_ini;
@@ -295,9 +294,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
             continue;
         }
         // regex: <space>?[^\s\p{L}\p{N}]+
-        if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
+        if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
             pos += (cpt == ' ');
-            while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
+            while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
                 flags2 = _get_flags(++pos);
             }
             _add_token(pos);
@@ -351,8 +350,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
     };
 
     auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-        static const codepoint_flags undef(codepoint_flags::UNDEFINED);
-        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
+        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
     };
 
     size_t _prev_end = offset_ini;
@@ -394,8 +392,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
             }
         }
 
-        // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
-        if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
+        // regex: [^\r\n\p{L}\p{N}]?\p{L}+
+        if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
             if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
                 pos++;
                 while (_get_flags(pos).is_letter) {
@@ -421,9 +419,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
         // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
         auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
-        if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
+        if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
             pos += (cpt == ' ');
-            while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
+            while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
                 flags2 = _get_flags(++pos);
             }
             uint32_t cpt2 = _get_cpt(pos);
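
Note: the two `_get_flags` rewrites and the new loop conditions work together. A position outside the current window now yields a value-initialized `codepoint_flags{}` with every bit cleared, so the added `&& ...as_uint()` term is what stops the scan at the window boundary (the old code used a sentinel with the UNDEFINED bit set for that). A standalone illustration of the idea — the struct below is a simplified stand-in, not the repository's exact definition:

    #include <cstdint>
    #include <cstring>

    // Simplified stand-in for codepoint_flags: value-initialization clears all
    // bit-fields, so as_uint() == 0 can only mean "no information", e.g. a
    // position past the end of the window.
    struct flags_sketch {
        uint16_t is_undefined   : 1;
        uint16_t is_whitespace  : 1;
        uint16_t is_letter      : 1;
        uint16_t is_number      : 1;
        uint16_t is_punctuation : 1;

        uint16_t as_uint() const {
            uint16_t v = 0;
            std::memcpy(&v, this, sizeof(v)); // avoid type punning in the sketch
            return v;
        }
    };

    static bool starts_punct_run(flags_sketch f) {
        // Mirrors the rewritten condition: not whitespace/letter/number, and the
        // flags carry some information (i.e. the position is still inside the window).
        return !(f.is_whitespace | f.is_letter | f.is_number) && f.as_uint();
    }
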
examples/talk-llama/unicode.h CHANGED
@@ -48,6 +48,7 @@ struct codepoint_flags {
 
 
 std::string unicode_cpt_to_utf8(uint32_t cp);
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
 
 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
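
Note: `unicode_cpt_from_utf8()` was previously a file-local helper in unicode.cpp and is now exported here. A small usage sketch (not part of this commit) that walks a UTF-8 string one codepoint at a time — essentially what `unicode_cpts_from_utf8()` already does internally:

    #include <cstdint>
    #include <string>
    #include <vector>

    #include "unicode.h"

    // Sketch: decode a UTF-8 string codepoint by codepoint with the newly
    // exported unicode_cpt_from_utf8(); `offset` is advanced past each sequence,
    // and malformed input is rejected by the implementation.
    static std::vector<uint32_t> decode_codepoints(const std::string & utf8) {
        std::vector<uint32_t> cpts;
        size_t offset = 0;
        while (offset < utf8.size()) {
            cpts.push_back(unicode_cpt_from_utf8(utf8, offset));
        }
        return cpts;
    }
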
src/whisper.cpp CHANGED
@@ -2949,7 +2949,7 @@ struct whisper_global_cache {
 // Mel spectrogram
 
 void whisper_mel_init(whisper_mel & mel, ggml_backend_t backend, int n_len, int n_len_org, int n_mel) {
-    WHISPER_LOG_INFO("%s: n_len = %d, n_len_org = %d, n_mel = %d\n", __func__, n_len, n_len_org, n_mel);
+    //WHISPER_LOG_INFO("%s: n_len = %d, n_len_org = %d, n_mel = %d\n", __func__, n_len, n_len_org, n_mel);
     mel.n_len_org = n_len_org;
     assert(!mel.ctx);
     mel.ctx = ggml_init({ggml_tensor_overhead(), nullptr, true});