ggerganov committed on
Commit
e48ba5c
·
1 Parent(s): a265bfa

stream : add "max_tokens" parameter

Browse files

Limits the number of tokens in a segment.
Useful for combating word repetition when using a partial encoder context.

Files changed (3) hide show
  1. examples/stream/stream.cpp +1 -0
  2. whisper.cpp +3 -1
  3. whisper.h +1 -1
examples/stream/stream.cpp CHANGED
@@ -322,6 +322,7 @@ int main(int argc, char ** argv) {
322
  {
323
  whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
324
 
 
325
  wparams.print_progress = false;
326
  wparams.print_special_tokens = params.print_special_tokens;
327
  wparams.print_realtime = false;
 
322
  {
323
  whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
324
 
325
+ wparams.max_tokens = 32;
326
  wparams.print_progress = false;
327
  wparams.print_special_tokens = params.print_special_tokens;
328
  wparams.print_realtime = false;
whisper.cpp CHANGED
@@ -2402,6 +2402,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
2402
  /*.thold_pt =*/ 0.01f,
2403
  /*.thold_ptsum =*/ 0.01f,
2404
  /*.max_len =*/ 0,
 
2405
 
2406
  /*.speed_up =*/ false,
2407
 
@@ -2443,6 +2444,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
2443
  /*.thold_pt =*/ 0.01f,
2444
  /*.thold_ptsum =*/ 0.01f,
2445
  /*.max_len =*/ 0,
 
2446
 
2447
  /*.speed_up =*/ false,
2448
 
@@ -2685,7 +2687,7 @@ int whisper_full(
2685
  //}
2686
 
2687
  // end of text token
2688
- if (token.id == whisper_token_eot(ctx) || (i > WHISPER_EXPERIMENT_MAX_TOKENS_PER_SEGMENT)) {
2689
  if (result_len == 0) {
2690
  if (seek + seek_delta + 100 >= seek_end) {
2691
  result_len = i + 1;
 
2402
  /*.thold_pt =*/ 0.01f,
2403
  /*.thold_ptsum =*/ 0.01f,
2404
  /*.max_len =*/ 0,
2405
+ /*.max_tokens =*/ 0,
2406
 
2407
  /*.speed_up =*/ false,
2408
 
 
2444
  /*.thold_pt =*/ 0.01f,
2445
  /*.thold_ptsum =*/ 0.01f,
2446
  /*.max_len =*/ 0,
2447
+ /*.max_tokens =*/ 0,
2448
 
2449
  /*.speed_up =*/ false,
2450
 
 
2687
  //}
2688
 
2689
  // end of text token
2690
+ if (token.id == whisper_token_eot(ctx) || (params.max_tokens > 0 && i > params.max_tokens)) {
2691
  if (result_len == 0) {
2692
  if (seek + seek_delta + 100 >= seek_end) {
2693
  result_len = i + 1;
whisper.h CHANGED
@@ -25,7 +25,6 @@
25
  #define WHISPER_CHUNK_SIZE 30
26
 
27
  #define WHISPER_EXPERIMENT_AUDIO_CTX 512
28
- #define WHISPER_EXPERIMENT_MAX_TOKENS_PER_SEGMENT 32
29
 
30
  #ifdef __cplusplus
31
  extern "C" {
@@ -205,6 +204,7 @@ extern "C" {
205
  float thold_pt; // timestamp token probability threshold (~0.01)
206
  float thold_ptsum; // timestamp token sum probability threshold (~0.01)
207
  int max_len; // max segment length in characters
 
208
 
209
  // [EXPERIMENTAL] speed-up techniques
210
  bool speed_up; // speed-up the audio by 2x using Phase Vocoder
 
25
  #define WHISPER_CHUNK_SIZE 30
26
 
27
  #define WHISPER_EXPERIMENT_AUDIO_CTX 512
 
28
 
29
  #ifdef __cplusplus
30
  extern "C" {
 
204
  float thold_pt; // timestamp token probability threshold (~0.01)
205
  float thold_ptsum; // timestamp token sum probability threshold (~0.01)
206
  int max_len; // max segment length in characters
207
+ int max_tokens; // max tokens per segment (0 = no limit)
208
 
209
  // [EXPERIMENTAL] speed-up techniques
210
  bool speed_up; // speed-up the audio by 2x using Phase Vocoder