Spaces:
Running
Running
stream : add "max_tokens" parameter
Browse files
Used to limit the number of tokens in a segment.
Useful for combating word repetition when using a partial encoder context.
- examples/stream/stream.cpp +1 -0
- whisper.cpp +3 -1
- whisper.h +1 -1
examples/stream/stream.cpp
CHANGED
|
@@ -322,6 +322,7 @@ int main(int argc, char ** argv) {
|
|
| 322 |
{
|
| 323 |
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
| 324 |
|
|
|
|
| 325 |
wparams.print_progress = false;
|
| 326 |
wparams.print_special_tokens = params.print_special_tokens;
|
| 327 |
wparams.print_realtime = false;
|
|
|
|
| 322 |
{
|
| 323 |
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
| 324 |
|
| 325 |
+
wparams.max_tokens = 32;
|
| 326 |
wparams.print_progress = false;
|
| 327 |
wparams.print_special_tokens = params.print_special_tokens;
|
| 328 |
wparams.print_realtime = false;
|
whisper.cpp
CHANGED
|
@@ -2402,6 +2402,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
| 2402 |
/*.thold_pt =*/ 0.01f,
|
| 2403 |
/*.thold_ptsum =*/ 0.01f,
|
| 2404 |
/*.max_len =*/ 0,
|
|
|
|
| 2405 |
|
| 2406 |
/*.speed_up =*/ false,
|
| 2407 |
|
|
@@ -2443,6 +2444,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
| 2443 |
/*.thold_pt =*/ 0.01f,
|
| 2444 |
/*.thold_ptsum =*/ 0.01f,
|
| 2445 |
/*.max_len =*/ 0,
|
|
|
|
| 2446 |
|
| 2447 |
/*.speed_up =*/ false,
|
| 2448 |
|
|
@@ -2685,7 +2687,7 @@ int whisper_full(
|
|
| 2685 |
//}
|
| 2686 |
|
| 2687 |
// end of text token
|
| 2688 |
-
if (token.id == whisper_token_eot(ctx) || (i > WHISPER_EXPERIMENT_MAX_TOKENS_PER_SEGMENT)) {
|
| 2689 |
if (result_len == 0) {
|
| 2690 |
if (seek + seek_delta + 100 >= seek_end) {
|
| 2691 |
result_len = i + 1;
|
|
|
|
| 2402 |
/*.thold_pt =*/ 0.01f,
|
| 2403 |
/*.thold_ptsum =*/ 0.01f,
|
| 2404 |
/*.max_len =*/ 0,
|
| 2405 |
+
/*.max_tokens =*/ 0,
|
| 2406 |
|
| 2407 |
/*.speed_up =*/ false,
|
| 2408 |
|
|
|
|
| 2444 |
/*.thold_pt =*/ 0.01f,
|
| 2445 |
/*.thold_ptsum =*/ 0.01f,
|
| 2446 |
/*.max_len =*/ 0,
|
| 2447 |
+
/*.max_tokens =*/ 0,
|
| 2448 |
|
| 2449 |
/*.speed_up =*/ false,
|
| 2450 |
|
|
|
|
| 2687 |
//}
|
| 2688 |
|
| 2689 |
// end of text token
|
| 2690 |
+
if (token.id == whisper_token_eot(ctx) || (params.max_tokens > 0 && i > params.max_tokens)) {
|
| 2691 |
if (result_len == 0) {
|
| 2692 |
if (seek + seek_delta + 100 >= seek_end) {
|
| 2693 |
result_len = i + 1;
|
whisper.h
CHANGED
|
@@ -25,7 +25,6 @@
|
|
| 25 |
#define WHISPER_CHUNK_SIZE 30
|
| 26 |
|
| 27 |
#define WHISPER_EXPERIMENT_AUDIO_CTX 512
|
| 28 |
-
#define WHISPER_EXPERIMENT_MAX_TOKENS_PER_SEGMENT 32
|
| 29 |
|
| 30 |
#ifdef __cplusplus
|
| 31 |
extern "C" {
|
|
@@ -205,6 +204,7 @@ extern "C" {
|
|
| 205 |
float thold_pt; // timestamp token probability threshold (~0.01)
|
| 206 |
float thold_ptsum; // timestamp token sum probability threshold (~0.01)
|
| 207 |
int max_len; // max segment length in characters
|
|
|
|
| 208 |
|
| 209 |
// [EXPERIMENTAL] speed-up techniques
|
| 210 |
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
|
|
|
| 25 |
#define WHISPER_CHUNK_SIZE 30
|
| 26 |
|
| 27 |
#define WHISPER_EXPERIMENT_AUDIO_CTX 512
|
|
|
|
| 28 |
|
| 29 |
#ifdef __cplusplus
|
| 30 |
extern "C" {
|
|
|
|
| 204 |
float thold_pt; // timestamp token probability threshold (~0.01)
|
| 205 |
float thold_ptsum; // timestamp token sum probability threshold (~0.01)
|
| 206 |
int max_len; // max segment length in characters
|
| 207 |
+
int max_tokens; // max tokens per segment (0 = no limit)
|
| 208 |
|
| 209 |
// [EXPERIMENTAL] speed-up techniques
|
| 210 |
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|