Spaces:
Sleeping
Sleeping
bobqianic
commited on
whisper : Replace WHISPER_PRINT_DEBUG with WHISPER_LOG_DEBUG (#1681)
Browse files- whisper.cpp +33 -36
whisper.cpp
CHANGED
|
@@ -122,9 +122,18 @@ WHISPER_ATTRIBUTE_FORMAT(2, 3)
|
|
| 122 |
static void whisper_log_internal (ggml_log_level level, const char * format, ...);
|
| 123 |
static void whisper_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
| 124 |
|
| 125 |
-
#define WHISPER_LOG_INFO(...) whisper_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
| 126 |
-
#define WHISPER_LOG_WARN(...) whisper_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
| 127 |
#define WHISPER_LOG_ERROR(...) whisper_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
#define WHISPER_ASSERT(x) \
|
| 130 |
do { \
|
|
@@ -134,18 +143,6 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
|
|
| 134 |
} \
|
| 135 |
} while (0)
|
| 136 |
|
| 137 |
-
// define this to enable verbose trace logging - useful for debugging purposes
|
| 138 |
-
//#define WHISPER_DEBUG
|
| 139 |
-
|
| 140 |
-
#if defined(WHISPER_DEBUG)
|
| 141 |
-
#define WHISPER_PRINT_DEBUG(...) \
|
| 142 |
-
do { \
|
| 143 |
-
fprintf(stderr, __VA_ARGS__); \
|
| 144 |
-
} while (0)
|
| 145 |
-
#else
|
| 146 |
-
#define WHISPER_PRINT_DEBUG(...)
|
| 147 |
-
#endif
|
| 148 |
-
|
| 149 |
//#define WHISPER_USE_FLASH_ATTN
|
| 150 |
//#define WHISPER_USE_FLASH_FF
|
| 151 |
#define WHISPER_MAX_DECODERS 8
|
|
@@ -2166,7 +2163,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
|
|
| 2166 |
const int32_t n_kv = ggml_allocr_is_measure(alloc) ? n_ctx : kv_self.n;
|
| 2167 |
const int32_t kv_head = ggml_allocr_is_measure(alloc) ? n_ctx - n_tokens : kv_self.head;
|
| 2168 |
|
| 2169 |
-
//
|
| 2170 |
|
| 2171 |
struct ggml_init_params params = {
|
| 2172 |
/*.mem_size =*/ wstate.alloc_decode.meta.size(),
|
|
@@ -4944,7 +4941,7 @@ static void whisper_sequence_score(
|
|
| 4944 |
const auto p = kv.second/(double)cnt;
|
| 4945 |
entropy -= p*log(p);
|
| 4946 |
|
| 4947 |
-
//
|
| 4948 |
}
|
| 4949 |
|
| 4950 |
sequence.entropy = entropy;
|
|
@@ -5010,7 +5007,7 @@ int whisper_full_with_state(
|
|
| 5010 |
// basically don't process anything that is less than 1.0s
|
| 5011 |
// see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
|
| 5012 |
if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
|
| 5013 |
-
|
| 5014 |
return 0;
|
| 5015 |
}
|
| 5016 |
|
|
@@ -5199,7 +5196,7 @@ int whisper_full_with_state(
|
|
| 5199 |
|
| 5200 |
n_decoders_cur = std::max(1, n_decoders_cur);
|
| 5201 |
|
| 5202 |
-
|
| 5203 |
|
| 5204 |
// TAGS: WHISPER_DECODER_INIT
|
| 5205 |
for (int j = 0; j < n_decoders_cur; ++j) {
|
|
@@ -5243,11 +5240,11 @@ int whisper_full_with_state(
|
|
| 5243 |
prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());
|
| 5244 |
|
| 5245 |
// print the prompt
|
| 5246 |
-
|
| 5247 |
for (int i = 0; i < (int) prompt.size(); i++) {
|
| 5248 |
-
|
| 5249 |
}
|
| 5250 |
-
|
| 5251 |
|
| 5252 |
whisper_kv_cache_clear(state->kv_self);
|
| 5253 |
|
|
@@ -5395,7 +5392,7 @@ int whisper_full_with_state(
|
|
| 5395 |
|
| 5396 |
whisper_kv_cache_seq_cp(state->kv_self, cur.decoder_idx, WHISPER_MAX_DECODERS + j, -1, -1);
|
| 5397 |
|
| 5398 |
-
|
| 5399 |
__func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
|
| 5400 |
}
|
| 5401 |
|
|
@@ -5438,7 +5435,7 @@ int whisper_full_with_state(
|
|
| 5438 |
|
| 5439 |
// do not allow to go back in time
|
| 5440 |
if (has_ts && seek_delta > seek_delta_new && result_len < i) {
|
| 5441 |
-
|
| 5442 |
failed = true; // TODO: maybe this is not a failure ?
|
| 5443 |
continue;
|
| 5444 |
}
|
|
@@ -5453,7 +5450,7 @@ int whisper_full_with_state(
|
|
| 5453 |
#ifdef WHISPER_DEBUG
|
| 5454 |
{
|
| 5455 |
const auto tt = token.pt > 0.10 ? ctx->vocab.id_to_token.at(token.tid) : "[?]";
|
| 5456 |
-
|
| 5457 |
__func__, i, j, token.id, token.p, tt.c_str(), token.pt, result_len, ctx->vocab.id_to_token.at(token.id).c_str());
|
| 5458 |
}
|
| 5459 |
#endif
|
|
@@ -5467,7 +5464,7 @@ int whisper_full_with_state(
|
|
| 5467 |
if (seek + seek_delta + 100 >= seek_end) {
|
| 5468 |
result_len = i + 1;
|
| 5469 |
} else {
|
| 5470 |
-
|
| 5471 |
failed = true;
|
| 5472 |
continue;
|
| 5473 |
}
|
|
@@ -5478,7 +5475,7 @@ int whisper_full_with_state(
|
|
| 5478 |
seek_delta = 100*WHISPER_CHUNK_SIZE;
|
| 5479 |
}
|
| 5480 |
|
| 5481 |
-
|
| 5482 |
completed = true;
|
| 5483 |
continue;
|
| 5484 |
}
|
|
@@ -5494,7 +5491,7 @@ int whisper_full_with_state(
|
|
| 5494 |
// sometimes, the decoding can get stuck in a repetition loop
|
| 5495 |
// this is an attempt to mitigate such cases - we flag the decoding as failed and use a fallback strategy
|
| 5496 |
if (i == n_max - 1 && (result_len == 0 || seek_delta < 100*WHISPER_CHUNK_SIZE/2)) {
|
| 5497 |
-
|
| 5498 |
failed = true;
|
| 5499 |
continue;
|
| 5500 |
}
|
|
@@ -5536,7 +5533,7 @@ int whisper_full_with_state(
|
|
| 5536 |
continue;
|
| 5537 |
}
|
| 5538 |
|
| 5539 |
-
//
|
| 5540 |
|
| 5541 |
decoder.i_batch = batch.n_tokens;
|
| 5542 |
|
|
@@ -5616,11 +5613,11 @@ int whisper_full_with_state(
|
|
| 5616 |
decoder.sequence.tokens.resize(decoder.sequence.result_len);
|
| 5617 |
whisper_sequence_score(params, decoder.sequence);
|
| 5618 |
|
| 5619 |
-
|
| 5620 |
__func__, j, decoder.sequence.score, decoder.sequence.result_len, decoder.sequence.avg_logprobs, decoder.sequence.entropy);
|
| 5621 |
|
| 5622 |
if (decoder.sequence.result_len > 32 && decoder.sequence.entropy < params.entropy_thold) {
|
| 5623 |
-
|
| 5624 |
__func__, j, decoder.sequence.entropy, params.entropy_thold);
|
| 5625 |
|
| 5626 |
decoder.failed = true;
|
|
@@ -5635,7 +5632,7 @@ int whisper_full_with_state(
|
|
| 5635 |
}
|
| 5636 |
}
|
| 5637 |
|
| 5638 |
-
|
| 5639 |
}
|
| 5640 |
|
| 5641 |
bool success = true;
|
|
@@ -5647,7 +5644,7 @@ int whisper_full_with_state(
|
|
| 5647 |
const auto & decoder = state->decoders[best_decoder_id];
|
| 5648 |
|
| 5649 |
if (decoder.failed || decoder.sequence.avg_logprobs < params.logprob_thold) {
|
| 5650 |
-
|
| 5651 |
success = false;
|
| 5652 |
state->n_fail_p++;
|
| 5653 |
}
|
|
@@ -5655,13 +5652,13 @@ int whisper_full_with_state(
|
|
| 5655 |
|
| 5656 |
if (success) {
|
| 5657 |
//for (auto & token : ctx->decoders[best_decoder_id].sequence.tokens) {
|
| 5658 |
-
//
|
| 5659 |
//}
|
| 5660 |
|
| 5661 |
break;
|
| 5662 |
}
|
| 5663 |
|
| 5664 |
-
|
| 5665 |
}
|
| 5666 |
|
| 5667 |
// output results through a user-provided callback
|
|
@@ -5673,7 +5670,7 @@ int whisper_full_with_state(
|
|
| 5673 |
|
| 5674 |
const auto & tokens_cur = best_decoder.sequence.tokens;
|
| 5675 |
|
| 5676 |
-
//
|
| 5677 |
|
| 5678 |
// update prompt_past
|
| 5679 |
prompt_past.clear();
|
|
@@ -5793,7 +5790,7 @@ int whisper_full_with_state(
|
|
| 5793 |
// update audio window
|
| 5794 |
seek += seek_delta;
|
| 5795 |
|
| 5796 |
-
|
| 5797 |
}
|
| 5798 |
}
|
| 5799 |
|
|
|
|
| 122 |
static void whisper_log_internal (ggml_log_level level, const char * format, ...);
|
| 123 |
static void whisper_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
| 124 |
|
|
|
|
|
|
|
| 125 |
#define WHISPER_LOG_ERROR(...) whisper_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
| 126 |
+
#define WHISPER_LOG_WARN(...) whisper_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
| 127 |
+
#define WHISPER_LOG_INFO(...) whisper_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
| 128 |
+
|
| 129 |
+
// define this to enable verbose trace logging - useful for debugging purposes
|
| 130 |
+
// #define WHISPER_DEBUG
|
| 131 |
+
|
| 132 |
+
#if defined(WHISPER_DEBUG)
|
| 133 |
+
#define WHISPER_LOG_DEBUG(...) whisper_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
| 134 |
+
#else
|
| 135 |
+
#define WHISPER_LOG_DEBUG(...)
|
| 136 |
+
#endif
|
| 137 |
|
| 138 |
#define WHISPER_ASSERT(x) \
|
| 139 |
do { \
|
|
|
|
| 143 |
} \
|
| 144 |
} while (0)
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
//#define WHISPER_USE_FLASH_ATTN
|
| 147 |
//#define WHISPER_USE_FLASH_FF
|
| 148 |
#define WHISPER_MAX_DECODERS 8
|
|
|
|
| 2163 |
const int32_t n_kv = ggml_allocr_is_measure(alloc) ? n_ctx : kv_self.n;
|
| 2164 |
const int32_t kv_head = ggml_allocr_is_measure(alloc) ? n_ctx - n_tokens : kv_self.head;
|
| 2165 |
|
| 2166 |
+
//WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);
|
| 2167 |
|
| 2168 |
struct ggml_init_params params = {
|
| 2169 |
/*.mem_size =*/ wstate.alloc_decode.meta.size(),
|
|
|
|
| 4941 |
const auto p = kv.second/(double)cnt;
|
| 4942 |
entropy -= p*log(p);
|
| 4943 |
|
| 4944 |
+
//WHISPER_LOG_DEBUG("entropy: %d %f %f, count %d\n", kv.first, p, log(p), kv.second);
|
| 4945 |
}
|
| 4946 |
|
| 4947 |
sequence.entropy = entropy;
|
|
|
|
| 5007 |
// basically don't process anything that is less than 1.0s
|
| 5008 |
// see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
|
| 5009 |
if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
|
| 5010 |
+
WHISPER_LOG_DEBUG("%s: input is too short - %d ms < 1000 ms\n", __func__, (seek_end - seek_start)*10);
|
| 5011 |
return 0;
|
| 5012 |
}
|
| 5013 |
|
|
|
|
| 5196 |
|
| 5197 |
n_decoders_cur = std::max(1, n_decoders_cur);
|
| 5198 |
|
| 5199 |
+
WHISPER_LOG_DEBUG("\n%s: strategy = %d, decoding with %d decoders, temperature = %.2f\n", __func__, params.strategy, n_decoders_cur, t_cur);
|
| 5200 |
|
| 5201 |
// TAGS: WHISPER_DECODER_INIT
|
| 5202 |
for (int j = 0; j < n_decoders_cur; ++j) {
|
|
|
|
| 5240 |
prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());
|
| 5241 |
|
| 5242 |
// print the prompt
|
| 5243 |
+
WHISPER_LOG_DEBUG("\n\n");
|
| 5244 |
for (int i = 0; i < (int) prompt.size(); i++) {
|
| 5245 |
+
WHISPER_LOG_DEBUG("%s: prompt[%d] = %s\n", __func__, i, ctx->vocab.id_to_token.at(prompt[i]).c_str());
|
| 5246 |
}
|
| 5247 |
+
WHISPER_LOG_DEBUG("\n\n");
|
| 5248 |
|
| 5249 |
whisper_kv_cache_clear(state->kv_self);
|
| 5250 |
|
|
|
|
| 5392 |
|
| 5393 |
whisper_kv_cache_seq_cp(state->kv_self, cur.decoder_idx, WHISPER_MAX_DECODERS + j, -1, -1);
|
| 5394 |
|
| 5395 |
+
WHISPER_LOG_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
|
| 5396 |
__func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
|
| 5397 |
}
|
| 5398 |
|
|
|
|
| 5435 |
|
| 5436 |
// do not allow to go back in time
|
| 5437 |
if (has_ts && seek_delta > seek_delta_new && result_len < i) {
|
| 5438 |
+
WHISPER_LOG_DEBUG("%s: decoder %d: failed due to seek_delta (%d > %d)\n", __func__, j, seek_delta, seek_delta_new);
|
| 5439 |
failed = true; // TODO: maybe this is not a failure ?
|
| 5440 |
continue;
|
| 5441 |
}
|
|
|
|
| 5450 |
#ifdef WHISPER_DEBUG
|
| 5451 |
{
|
| 5452 |
const auto tt = token.pt > 0.10 ? ctx->vocab.id_to_token.at(token.tid) : "[?]";
|
| 5453 |
+
WHISPER_LOG_DEBUG("%s: id = %3d, decoder = %d, token = %6d, p = %6.3f, ts = %10s, %6.3f, result_len = %4d '%s'\n",
|
| 5454 |
__func__, i, j, token.id, token.p, tt.c_str(), token.pt, result_len, ctx->vocab.id_to_token.at(token.id).c_str());
|
| 5455 |
}
|
| 5456 |
#endif
|
|
|
|
| 5464 |
if (seek + seek_delta + 100 >= seek_end) {
|
| 5465 |
result_len = i + 1;
|
| 5466 |
} else {
|
| 5467 |
+
WHISPER_LOG_DEBUG("%s: decoder %d failed (result_len = 0)\n", __func__, j);
|
| 5468 |
failed = true;
|
| 5469 |
continue;
|
| 5470 |
}
|
|
|
|
| 5475 |
seek_delta = 100*WHISPER_CHUNK_SIZE;
|
| 5476 |
}
|
| 5477 |
|
| 5478 |
+
WHISPER_LOG_DEBUG("%s: decoder %d completed\n", __func__, j);
|
| 5479 |
completed = true;
|
| 5480 |
continue;
|
| 5481 |
}
|
|
|
|
| 5491 |
// sometimes, the decoding can get stuck in a repetition loop
|
| 5492 |
// this is an attempt to mitigate such cases - we flag the decoding as failed and use a fallback strategy
|
| 5493 |
if (i == n_max - 1 && (result_len == 0 || seek_delta < 100*WHISPER_CHUNK_SIZE/2)) {
|
| 5494 |
+
WHISPER_LOG_DEBUG("%s: decoder %d: failed due to repetition loop\n", __func__, j);
|
| 5495 |
failed = true;
|
| 5496 |
continue;
|
| 5497 |
}
|
|
|
|
| 5533 |
continue;
|
| 5534 |
}
|
| 5535 |
|
| 5536 |
+
//WHISPER_LOG_DEBUG("%s: decoder %d: token %d, seek_delta %d\n", __func__, j, decoder.sequence.tokens.back().id, decoder.seek_delta);
|
| 5537 |
|
| 5538 |
decoder.i_batch = batch.n_tokens;
|
| 5539 |
|
|
|
|
| 5613 |
decoder.sequence.tokens.resize(decoder.sequence.result_len);
|
| 5614 |
whisper_sequence_score(params, decoder.sequence);
|
| 5615 |
|
| 5616 |
+
WHISPER_LOG_DEBUG("%s: decoder %2d: score = %8.5f, result_len = %3d, avg_logprobs = %8.5f, entropy = %8.5f\n",
|
| 5617 |
__func__, j, decoder.sequence.score, decoder.sequence.result_len, decoder.sequence.avg_logprobs, decoder.sequence.entropy);
|
| 5618 |
|
| 5619 |
if (decoder.sequence.result_len > 32 && decoder.sequence.entropy < params.entropy_thold) {
|
| 5620 |
+
WHISPER_LOG_DEBUG("%s: decoder %2d: failed due to entropy %8.5f < %8.5f\n",
|
| 5621 |
__func__, j, decoder.sequence.entropy, params.entropy_thold);
|
| 5622 |
|
| 5623 |
decoder.failed = true;
|
|
|
|
| 5632 |
}
|
| 5633 |
}
|
| 5634 |
|
| 5635 |
+
WHISPER_LOG_DEBUG("%s: best decoder = %d\n", __func__, best_decoder_id);
|
| 5636 |
}
|
| 5637 |
|
| 5638 |
bool success = true;
|
|
|
|
| 5644 |
const auto & decoder = state->decoders[best_decoder_id];
|
| 5645 |
|
| 5646 |
if (decoder.failed || decoder.sequence.avg_logprobs < params.logprob_thold) {
|
| 5647 |
+
WHISPER_LOG_DEBUG("%s: failed due to avg_logprobs %8.5f < %8.5f\n", __func__, decoder.sequence.avg_logprobs, params.logprob_thold);
|
| 5648 |
success = false;
|
| 5649 |
state->n_fail_p++;
|
| 5650 |
}
|
|
|
|
| 5652 |
|
| 5653 |
if (success) {
|
| 5654 |
//for (auto & token : ctx->decoders[best_decoder_id].sequence.tokens) {
|
| 5655 |
+
// WHISPER_LOG_DEBUG("%s: token = %d, p = %6.3f, pt = %6.3f, ts = %s, str = %s\n", __func__, token.id, token.p, token.pt, ctx->vocab.id_to_token.at(token.tid).c_str(), ctx->vocab.id_to_token.at(token.id).c_str());
|
| 5656 |
//}
|
| 5657 |
|
| 5658 |
break;
|
| 5659 |
}
|
| 5660 |
|
| 5661 |
+
WHISPER_LOG_DEBUG("\n%s: failed to decode with temperature = %.2f\n", __func__, t_cur);
|
| 5662 |
}
|
| 5663 |
|
| 5664 |
// output results through a user-provided callback
|
|
|
|
| 5670 |
|
| 5671 |
const auto & tokens_cur = best_decoder.sequence.tokens;
|
| 5672 |
|
| 5673 |
+
//WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
|
| 5674 |
|
| 5675 |
// update prompt_past
|
| 5676 |
prompt_past.clear();
|
|
|
|
| 5790 |
// update audio window
|
| 5791 |
seek += seek_delta;
|
| 5792 |
|
| 5793 |
+
WHISPER_LOG_DEBUG("seek = %d, seek_delta = %d\n", seek, seek_delta);
|
| 5794 |
}
|
| 5795 |
}
|
| 5796 |
|