whisper.cpp

Running

App Files Community

ggerganov committed on Nov 22, 2022

Commit

28726dd

unverified ·

1 Parent(s): 8ad3dbf

stream : "-kc" now enables context keeping from previous segment (#90)

Browse files

Files changed (3) hide show

examples/stream/stream.cpp +13 -10
whisper.cpp +2 -2
whisper.h +2 -1

examples/stream/stream.cpp CHANGED Viewed

@@ -336,7 +336,7 @@ int main(int argc, char ** argv) {
             wparams.print_realtime       = false;
             wparams.print_timestamps     = !params.no_timestamps;
             wparams.translate            = params.translate;
-            wparams.no_context           = params.no_context;
             wparams.single_segment       = true;
             wparams.max_tokens           = params.max_tokens;
             wparams.language             = params.language.c_str();
@@ -345,9 +345,9 @@ int main(int argc, char ** argv) {
             wparams.audio_ctx            = params.audio_ctx;
             wparams.speed_up             = params.speed_up;
-            wparams.prompt_tokens        = prompt_tokens.data();
-            wparams.prompt_n_tokens      = prompt_tokens.size();
             if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
                 fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                 return 6;
@@ -399,12 +399,15 @@ int main(int argc, char ** argv) {
                 pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
                 // Add tokens of the last full length segment as the prompt
-                prompt_tokens.clear();
-                const int n_segments = whisper_full_n_segments(ctx);
-                for (int i = 0; i < n_segments; ++i) {
-                    const int token_count = whisper_full_n_tokens(ctx, i);
-                    for (int j = 0; j < token_count; ++j) {
-                        prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
                     }
                 }
             }

             wparams.print_realtime       = false;
             wparams.print_timestamps     = !params.no_timestamps;
             wparams.translate            = params.translate;
+            wparams.no_context           = true;
             wparams.single_segment       = true;
             wparams.max_tokens           = params.max_tokens;
             wparams.language             = params.language.c_str();
             wparams.audio_ctx            = params.audio_ctx;
             wparams.speed_up             = params.speed_up;
+            wparams.prompt_tokens        = params.no_context ? nullptr : prompt_tokens.data();
+            wparams.prompt_n_tokens      = params.no_context ? 0       : prompt_tokens.size();
             if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
                 fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                 return 6;
                 pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
                 // Add tokens of the last full length segment as the prompt
+                if (!params.no_context) {
+                    prompt_tokens.clear();
+                    const int n_segments = whisper_full_n_segments(ctx);
+                    for (int i = 0; i < n_segments; ++i) {
+                        const int token_count = whisper_full_n_tokens(ctx, i);
+                        for (int j = 0; j < token_count; ++j) {
+                            prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
+                        }
                     }
                 }
             }

whisper.cpp CHANGED Viewed

@@ -2590,9 +2590,9 @@ int whisper_full(
         prompt_past.clear();
     }
-    // Prepend the prompt tokens to the prompt_past
     if (params.prompt_tokens && params.prompt_n_tokens > 0) {
-        // Parse tokens from the pointer (it points to an std::vector)
         for (int i = 0; i < params.prompt_n_tokens; i++) {
             prompt_past.push_back(params.prompt_tokens[i]);
         }

         prompt_past.clear();
     }
+    // prepend the prompt tokens to the prompt_past
     if (params.prompt_tokens && params.prompt_n_tokens > 0) {
+        // parse tokens from the pointer
         for (int i = 0; i < params.prompt_n_tokens; i++) {
             prompt_past.push_back(params.prompt_tokens[i]);
         }

whisper.h CHANGED Viewed

@@ -208,7 +208,8 @@ extern "C" {
         bool speed_up;  // speed-up the audio by 2x using Phase Vocoder
         int  audio_ctx; // overwrite the audio context size (0 = use default)
-        // std::vector<whisper_token>: tokens to provide the whisper model as initial prompt
         const whisper_token * prompt_tokens;
         int prompt_n_tokens;

         bool speed_up;  // speed-up the audio by 2x using Phase Vocoder
         int  audio_ctx; // overwrite the audio context size (0 = use default)
+        // tokens to provide the whisper model as initial prompt
+        // these are prepended to any existing text context from a previous call
         const whisper_token * prompt_tokens;
         int prompt_n_tokens;