Spaces:
Running
Running
stream : "-kc" now enables context keeping from previous segment (#90)
Browse files
- examples/stream/stream.cpp +13 -10
- whisper.cpp +2 -2
- whisper.h +2 -1
examples/stream/stream.cpp
CHANGED
|
@@ -336,7 +336,7 @@ int main(int argc, char ** argv) {
|
|
| 336 |
wparams.print_realtime = false;
|
| 337 |
wparams.print_timestamps = !params.no_timestamps;
|
| 338 |
wparams.translate = params.translate;
|
| 339 |
-
wparams.no_context =
|
| 340 |
wparams.single_segment = true;
|
| 341 |
wparams.max_tokens = params.max_tokens;
|
| 342 |
wparams.language = params.language.c_str();
|
|
@@ -345,9 +345,9 @@ int main(int argc, char ** argv) {
|
|
| 345 |
wparams.audio_ctx = params.audio_ctx;
|
| 346 |
wparams.speed_up = params.speed_up;
|
| 347 |
|
| 348 |
-
wparams.prompt_tokens = prompt_tokens.data();
|
| 349 |
-
wparams.prompt_n_tokens = prompt_tokens.size();
|
| 350 |
-
|
| 351 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
| 352 |
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
| 353 |
return 6;
|
|
@@ -399,12 +399,15 @@ int main(int argc, char ** argv) {
|
|
| 399 |
pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
|
| 400 |
|
| 401 |
// Add tokens of the last full length segment as the prompt
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
const int
|
| 406 |
-
for (int
|
| 407 |
-
|
|
|
|
|
|
|
|
|
|
| 408 |
}
|
| 409 |
}
|
| 410 |
}
|
|
|
|
| 336 |
wparams.print_realtime = false;
|
| 337 |
wparams.print_timestamps = !params.no_timestamps;
|
| 338 |
wparams.translate = params.translate;
|
| 339 |
+
wparams.no_context = true;
|
| 340 |
wparams.single_segment = true;
|
| 341 |
wparams.max_tokens = params.max_tokens;
|
| 342 |
wparams.language = params.language.c_str();
|
|
|
|
| 345 |
wparams.audio_ctx = params.audio_ctx;
|
| 346 |
wparams.speed_up = params.speed_up;
|
| 347 |
|
| 348 |
+
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
|
| 349 |
+
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
|
| 350 |
+
|
| 351 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
| 352 |
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
| 353 |
return 6;
|
|
|
|
| 399 |
pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
|
| 400 |
|
| 401 |
// Add tokens of the last full length segment as the prompt
|
| 402 |
+
if (!params.no_context) {
|
| 403 |
+
prompt_tokens.clear();
|
| 404 |
+
|
| 405 |
+
const int n_segments = whisper_full_n_segments(ctx);
|
| 406 |
+
for (int i = 0; i < n_segments; ++i) {
|
| 407 |
+
const int token_count = whisper_full_n_tokens(ctx, i);
|
| 408 |
+
for (int j = 0; j < token_count; ++j) {
|
| 409 |
+
prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
|
| 410 |
+
}
|
| 411 |
}
|
| 412 |
}
|
| 413 |
}
|
whisper.cpp
CHANGED
|
@@ -2590,9 +2590,9 @@ int whisper_full(
|
|
| 2590 |
prompt_past.clear();
|
| 2591 |
}
|
| 2592 |
|
| 2593 |
-
//
|
| 2594 |
if (params.prompt_tokens && params.prompt_n_tokens > 0) {
|
| 2595 |
-
//
|
| 2596 |
for (int i = 0; i < params.prompt_n_tokens; i++) {
|
| 2597 |
prompt_past.push_back(params.prompt_tokens[i]);
|
| 2598 |
}
|
|
|
|
| 2590 |
prompt_past.clear();
|
| 2591 |
}
|
| 2592 |
|
| 2593 |
+
// prepend the prompt tokens to the prompt_past
|
| 2594 |
if (params.prompt_tokens && params.prompt_n_tokens > 0) {
|
| 2595 |
+
// parse tokens from the pointer
|
| 2596 |
for (int i = 0; i < params.prompt_n_tokens; i++) {
|
| 2597 |
prompt_past.push_back(params.prompt_tokens[i]);
|
| 2598 |
}
|
whisper.h
CHANGED
|
@@ -208,7 +208,8 @@ extern "C" {
|
|
| 208 |
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
| 209 |
int audio_ctx; // overwrite the audio context size (0 = use default)
|
| 210 |
|
| 211 |
-
//
|
|
|
|
| 212 |
const whisper_token * prompt_tokens;
|
| 213 |
int prompt_n_tokens;
|
| 214 |
|
|
|
|
| 208 |
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
| 209 |
int audio_ctx; // overwrite the audio context size (0 = use default)
|
| 210 |
|
| 211 |
+
// tokens to provide the whisper model as initial prompt
|
| 212 |
+
// these are prepended to any existing text context from a previous call
|
| 213 |
const whisper_token * prompt_tokens;
|
| 214 |
int prompt_n_tokens;
|
| 215 |
|