Spaces:
Running
Running
whisper : add single-timestamp logic (#2629)
Browse files
* Fix hallucinations during silence
When the predicted tokens end with a single timestamp, the entire 30-second segment should be considered done, to avoid hallucinations for the remaining part of the segment.
This behaviour is on par with OpenAI's whisper. Refer to the logic related to `single_timestamp_ending` in https://github.com/openai/whisper/blob/main/whisper/transcribe.py
* Accept review comments related to formatting.
Co-authored-by: Georgi Gerganov <[email protected]>
---------
Co-authored-by: Georgi Gerganov <[email protected]>
- src/whisper.cpp +10 -1
src/whisper.cpp
CHANGED
|
@@ -6060,7 +6060,7 @@ int whisper_full_with_state(
|
|
| 6060 |
{
|
| 6061 |
const auto & best_decoder = state->decoders[best_decoder_id];
|
| 6062 |
|
| 6063 |
-
|
| 6064 |
const auto result_len = best_decoder.sequence.result_len;
|
| 6065 |
|
| 6066 |
const auto & tokens_cur = best_decoder.sequence.tokens;
|
|
@@ -6201,6 +6201,15 @@ int whisper_full_with_state(
|
|
| 6201 |
}
|
| 6202 |
}
|
| 6203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6204 |
// update audio window
|
| 6205 |
seek += seek_delta;
|
| 6206 |
|
|
|
|
| 6060 |
{
|
| 6061 |
const auto & best_decoder = state->decoders[best_decoder_id];
|
| 6062 |
|
| 6063 |
+
auto seek_delta = best_decoder.seek_delta;
|
| 6064 |
const auto result_len = best_decoder.sequence.result_len;
|
| 6065 |
|
| 6066 |
const auto & tokens_cur = best_decoder.sequence.tokens;
|
|
|
|
| 6201 |
}
|
| 6202 |
}
|
| 6203 |
|
| 6204 |
+
// ref: https://github.com/ggerganov/whisper.cpp/pull/2629
|
| 6205 |
+
const bool single_timestamp_ending = tokens_cur.size() > 1 &&
|
| 6206 |
+
tokens_cur[tokens_cur.size() - 2].id < whisper_token_beg(ctx) &&
|
| 6207 |
+
tokens_cur[tokens_cur.size() - 1].id > whisper_token_beg(ctx);
|
| 6208 |
+
if (single_timestamp_ending) {
|
| 6209 |
+
WHISPER_LOG_DEBUG("single timestamp ending - skip entire chunk\n");
|
| 6210 |
+
seek_delta = std::min(seek_end - seek, WHISPER_CHUNK_SIZE * 100);
|
| 6211 |
+
}
|
| 6212 |
+
|
| 6213 |
// update audio window
|
| 6214 |
seek += seek_delta;
|
| 6215 |
|