ggerganov committed on
Commit
28726dd
·
unverified ·
1 Parent(s): 8ad3dbf

stream : "-kc" now enables context keeping from previous segment (#90)

Browse files
Files changed (3) hide show
  1. examples/stream/stream.cpp +13 -10
  2. whisper.cpp +2 -2
  3. whisper.h +2 -1
examples/stream/stream.cpp CHANGED
@@ -336,7 +336,7 @@ int main(int argc, char ** argv) {
336
  wparams.print_realtime = false;
337
  wparams.print_timestamps = !params.no_timestamps;
338
  wparams.translate = params.translate;
339
- wparams.no_context = params.no_context;
340
  wparams.single_segment = true;
341
  wparams.max_tokens = params.max_tokens;
342
  wparams.language = params.language.c_str();
@@ -345,9 +345,9 @@ int main(int argc, char ** argv) {
345
  wparams.audio_ctx = params.audio_ctx;
346
  wparams.speed_up = params.speed_up;
347
 
348
- wparams.prompt_tokens = prompt_tokens.data();
349
- wparams.prompt_n_tokens = prompt_tokens.size();
350
-
351
  if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
352
  fprintf(stderr, "%s: failed to process audio\n", argv[0]);
353
  return 6;
@@ -399,12 +399,15 @@ int main(int argc, char ** argv) {
399
  pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
400
 
401
  // Add tokens of the last full length segment as the prompt
402
- prompt_tokens.clear();
403
- const int n_segments = whisper_full_n_segments(ctx);
404
- for (int i = 0; i < n_segments; ++i) {
405
- const int token_count = whisper_full_n_tokens(ctx, i);
406
- for (int j = 0; j < token_count; ++j) {
407
- prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
 
 
 
408
  }
409
  }
410
  }
 
336
  wparams.print_realtime = false;
337
  wparams.print_timestamps = !params.no_timestamps;
338
  wparams.translate = params.translate;
339
+ wparams.no_context = true;
340
  wparams.single_segment = true;
341
  wparams.max_tokens = params.max_tokens;
342
  wparams.language = params.language.c_str();
 
345
  wparams.audio_ctx = params.audio_ctx;
346
  wparams.speed_up = params.speed_up;
347
 
348
+ wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
349
+ wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
350
+
351
  if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
352
  fprintf(stderr, "%s: failed to process audio\n", argv[0]);
353
  return 6;
 
399
  pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
400
 
401
  // Add tokens of the last full length segment as the prompt
402
+ if (!params.no_context) {
403
+ prompt_tokens.clear();
404
+
405
+ const int n_segments = whisper_full_n_segments(ctx);
406
+ for (int i = 0; i < n_segments; ++i) {
407
+ const int token_count = whisper_full_n_tokens(ctx, i);
408
+ for (int j = 0; j < token_count; ++j) {
409
+ prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
410
+ }
411
  }
412
  }
413
  }
whisper.cpp CHANGED
@@ -2590,9 +2590,9 @@ int whisper_full(
2590
  prompt_past.clear();
2591
  }
2592
 
2593
- // Prepend the prompt tokens to the prompt_past
2594
  if (params.prompt_tokens && params.prompt_n_tokens > 0) {
2595
- // Parse tokens from the pointer (it points to an std::vector)
2596
  for (int i = 0; i < params.prompt_n_tokens; i++) {
2597
  prompt_past.push_back(params.prompt_tokens[i]);
2598
  }
 
2590
  prompt_past.clear();
2591
  }
2592
 
2593
+ // prepend the prompt tokens to the prompt_past
2594
  if (params.prompt_tokens && params.prompt_n_tokens > 0) {
2595
+ // parse tokens from the pointer
2596
  for (int i = 0; i < params.prompt_n_tokens; i++) {
2597
  prompt_past.push_back(params.prompt_tokens[i]);
2598
  }
whisper.h CHANGED
@@ -208,7 +208,8 @@ extern "C" {
208
  bool speed_up; // speed-up the audio by 2x using Phase Vocoder
209
  int audio_ctx; // overwrite the audio context size (0 = use default)
210
 
211
- // std::vector<whisper_token>: tokens to provide the whisper model as initial prompt
 
212
  const whisper_token * prompt_tokens;
213
  int prompt_n_tokens;
214
 
 
208
  bool speed_up; // speed-up the audio by 2x using Phase Vocoder
209
  int audio_ctx; // overwrite the audio context size (0 = use default)
210
 
211
+ // tokens to provide the whisper model as initial prompt
212
+ // these are prepended to any existing text context from a previous call
213
  const whisper_token * prompt_tokens;
214
  int prompt_n_tokens;
215