Spaces:
Running
Running
ref #17 : print whisper logs to stderr
Only the transcribed/translated text is printed to stdout.
This way, one can redirect the result to a file.
- main.cpp +11 -11
- whisper.cpp +27 -27
main.cpp
CHANGED
|
@@ -192,21 +192,21 @@ int main(int argc, char ** argv) {
|
|
| 192 |
|
| 193 |
// print some info about the processing
|
| 194 |
{
|
| 195 |
-
|
| 196 |
if (!whisper_is_multilingual(ctx)) {
|
| 197 |
if (params.language != "en" || params.translate) {
|
| 198 |
params.language = "en";
|
| 199 |
params.translate = false;
|
| 200 |
-
|
| 201 |
}
|
| 202 |
}
|
| 203 |
-
|
| 204 |
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, params.n_threads,
|
| 205 |
params.language.c_str(),
|
| 206 |
params.translate ? "translate" : "transcribe",
|
| 207 |
params.no_timestamps ? 0 : 1);
|
| 208 |
|
| 209 |
-
|
| 210 |
}
|
| 211 |
|
| 212 |
|
|
@@ -230,25 +230,25 @@ int main(int argc, char ** argv) {
|
|
| 230 |
|
| 231 |
// print result
|
| 232 |
if (!wparams.print_realtime) {
|
| 233 |
-
|
| 234 |
|
| 235 |
const int n_segments = whisper_full_n_segments(ctx);
|
| 236 |
for (int i = 0; i < n_segments; ++i) {
|
| 237 |
const char * text = whisper_full_get_segment_text(ctx, i);
|
| 238 |
|
| 239 |
if (params.no_timestamps) {
|
| 240 |
-
|
| 241 |
fflush(stdout);
|
| 242 |
} else {
|
| 243 |
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
| 244 |
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 245 |
|
| 246 |
-
|
| 247 |
}
|
| 248 |
}
|
| 249 |
}
|
| 250 |
|
| 251 |
-
|
| 252 |
|
| 253 |
// output to text file
|
| 254 |
if (params.output_txt) {
|
|
@@ -260,7 +260,7 @@ int main(int argc, char ** argv) {
|
|
| 260 |
return 8;
|
| 261 |
}
|
| 262 |
|
| 263 |
-
|
| 264 |
|
| 265 |
const int n_segments = whisper_full_n_segments(ctx);
|
| 266 |
for (int i = 0; i < n_segments; ++i) {
|
|
@@ -279,7 +279,7 @@ int main(int argc, char ** argv) {
|
|
| 279 |
return 9;
|
| 280 |
}
|
| 281 |
|
| 282 |
-
|
| 283 |
|
| 284 |
fout_vtt << "WEBVTT\n\n";
|
| 285 |
|
|
@@ -304,7 +304,7 @@ int main(int argc, char ** argv) {
|
|
| 304 |
return 10;
|
| 305 |
}
|
| 306 |
|
| 307 |
-
|
| 308 |
|
| 309 |
const int n_segments = whisper_full_n_segments(ctx);
|
| 310 |
for (int i = 0; i < n_segments; ++i) {
|
|
|
|
| 192 |
|
| 193 |
// print some info about the processing
|
| 194 |
{
|
| 195 |
+
fprintf(stderr, "\n");
|
| 196 |
if (!whisper_is_multilingual(ctx)) {
|
| 197 |
if (params.language != "en" || params.translate) {
|
| 198 |
params.language = "en";
|
| 199 |
params.translate = false;
|
| 200 |
+
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
| 201 |
}
|
| 202 |
}
|
| 203 |
+
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
|
| 204 |
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, params.n_threads,
|
| 205 |
params.language.c_str(),
|
| 206 |
params.translate ? "translate" : "transcribe",
|
| 207 |
params.no_timestamps ? 0 : 1);
|
| 208 |
|
| 209 |
+
fprintf(stderr, "\n");
|
| 210 |
}
|
| 211 |
|
| 212 |
|
|
|
|
| 230 |
|
| 231 |
// print result
|
| 232 |
if (!wparams.print_realtime) {
|
| 233 |
+
fprintf(stderr, "\n");
|
| 234 |
|
| 235 |
const int n_segments = whisper_full_n_segments(ctx);
|
| 236 |
for (int i = 0; i < n_segments; ++i) {
|
| 237 |
const char * text = whisper_full_get_segment_text(ctx, i);
|
| 238 |
|
| 239 |
if (params.no_timestamps) {
|
| 240 |
+
fprintf(stderr, "%s", text);
|
| 241 |
fflush(stdout);
|
| 242 |
} else {
|
| 243 |
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
| 244 |
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 245 |
|
| 246 |
+
fprintf(stderr, "[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
|
| 247 |
}
|
| 248 |
}
|
| 249 |
}
|
| 250 |
|
| 251 |
+
fprintf(stderr, "\n");
|
| 252 |
|
| 253 |
// output to text file
|
| 254 |
if (params.output_txt) {
|
|
|
|
| 260 |
return 8;
|
| 261 |
}
|
| 262 |
|
| 263 |
+
fprintf(stderr, "%s: saving output to '%s.txt'\n", __func__, fname_inp.c_str());
|
| 264 |
|
| 265 |
const int n_segments = whisper_full_n_segments(ctx);
|
| 266 |
for (int i = 0; i < n_segments; ++i) {
|
|
|
|
| 279 |
return 9;
|
| 280 |
}
|
| 281 |
|
| 282 |
+
fprintf(stderr, "%s: saving output to '%s.vtt'\n", __func__, fname_inp.c_str());
|
| 283 |
|
| 284 |
fout_vtt << "WEBVTT\n\n";
|
| 285 |
|
|
|
|
| 304 |
return 10;
|
| 305 |
}
|
| 306 |
|
| 307 |
+
fprintf(stderr, "%s: saving output to '%s.srt'\n", __func__, fname_inp.c_str());
|
| 308 |
|
| 309 |
const int n_segments = whisper_full_n_segments(ctx);
|
| 310 |
for (int i = 0; i < n_segments; ++i) {
|
whisper.cpp
CHANGED
|
@@ -421,7 +421,7 @@ struct whisper_context {
|
|
| 421 |
// see the convert-pt-to-ggml.py script for details
|
| 422 |
//
|
| 423 |
bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
|
| 424 |
-
|
| 425 |
|
| 426 |
auto & model = wctx.model;
|
| 427 |
auto & vocab = wctx.vocab;
|
|
@@ -480,18 +480,18 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
|
|
| 480 |
model.type = e_model::MODEL_LARGE;
|
| 481 |
}
|
| 482 |
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
|
| 496 |
wctx.buf_model.resize(MEM_REQ_MODEL.at(model.type));
|
| 497 |
wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
|
|
@@ -503,7 +503,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
|
|
| 503 |
wctx.buf_compute.size() +
|
| 504 |
wctx.buf_compute_layer.size();
|
| 505 |
|
| 506 |
-
|
| 507 |
}
|
| 508 |
|
| 509 |
// load mel filters
|
|
@@ -553,7 +553,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
|
|
| 553 |
}
|
| 554 |
|
| 555 |
if (n_vocab < model.hparams.n_vocab) {
|
| 556 |
-
|
| 557 |
for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
|
| 558 |
if (i > vocab.token_beg) {
|
| 559 |
word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
|
|
@@ -698,7 +698,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
|
|
| 698 |
|
| 699 |
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
|
| 700 |
|
| 701 |
-
|
| 702 |
}
|
| 703 |
|
| 704 |
// create the ggml context
|
|
@@ -945,7 +945,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
|
|
| 945 |
ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) +
|
| 946 |
ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);
|
| 947 |
|
| 948 |
-
|
| 949 |
}
|
| 950 |
|
| 951 |
// load weights
|
|
@@ -1008,10 +1008,10 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
|
|
| 1008 |
n_loaded++;
|
| 1009 |
}
|
| 1010 |
|
| 1011 |
-
|
| 1012 |
|
| 1013 |
if (n_loaded == 0) {
|
| 1014 |
-
|
| 1015 |
} else if (n_loaded != (int) model.tensors.size()) {
|
| 1016 |
fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), n_loaded);
|
| 1017 |
return false;
|
|
@@ -2242,13 +2242,13 @@ whisper_token whisper_token_transcribe() {
|
|
| 2242 |
void whisper_print_timings(struct whisper_context * ctx) {
|
| 2243 |
const int64_t t_end_us = ggml_time_us();
|
| 2244 |
|
| 2245 |
-
|
| 2246 |
-
|
| 2247 |
-
|
| 2248 |
-
|
| 2249 |
-
|
| 2250 |
-
|
| 2251 |
-
|
| 2252 |
}
|
| 2253 |
|
| 2254 |
////////////////////////////////////////////////////////////////////////////
|
|
@@ -2349,7 +2349,7 @@ int whisper_full(
|
|
| 2349 |
while (progress_cur >= progress_prev + progress_step) {
|
| 2350 |
progress_prev += progress_step;
|
| 2351 |
if (params.print_progress) {
|
| 2352 |
-
|
| 2353 |
}
|
| 2354 |
}
|
| 2355 |
|
|
|
|
| 421 |
// see the convert-pt-to-ggml.py script for details
|
| 422 |
//
|
| 423 |
bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
|
| 424 |
+
fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str());
|
| 425 |
|
| 426 |
auto & model = wctx.model;
|
| 427 |
auto & vocab = wctx.vocab;
|
|
|
|
| 480 |
model.type = e_model::MODEL_LARGE;
|
| 481 |
}
|
| 482 |
|
| 483 |
+
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
| 484 |
+
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
|
| 485 |
+
fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
|
| 486 |
+
fprintf(stderr, "%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
|
| 487 |
+
fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
|
| 488 |
+
fprintf(stderr, "%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
|
| 489 |
+
fprintf(stderr, "%s: n_text_state = %d\n", __func__, hparams.n_text_state);
|
| 490 |
+
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
|
| 491 |
+
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
|
| 492 |
+
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
|
| 493 |
+
fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
|
| 494 |
+
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
|
| 495 |
|
| 496 |
wctx.buf_model.resize(MEM_REQ_MODEL.at(model.type));
|
| 497 |
wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
|
|
|
|
| 503 |
wctx.buf_compute.size() +
|
| 504 |
wctx.buf_compute_layer.size();
|
| 505 |
|
| 506 |
+
fprintf(stderr, "%s: mem_required = %.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
|
| 507 |
}
|
| 508 |
|
| 509 |
// load mel filters
|
|
|
|
| 553 |
}
|
| 554 |
|
| 555 |
if (n_vocab < model.hparams.n_vocab) {
|
| 556 |
+
fprintf(stderr, "%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
|
| 557 |
for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
|
| 558 |
if (i > vocab.token_beg) {
|
| 559 |
word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
|
|
|
|
| 698 |
|
| 699 |
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
|
| 700 |
|
| 701 |
+
fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
| 702 |
}
|
| 703 |
|
| 704 |
// create the ggml context
|
|
|
|
| 945 |
ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) +
|
| 946 |
ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);
|
| 947 |
|
| 948 |
+
fprintf(stderr, "%s: memory size = %8.2f MB \n", __func__, memory_size/1024.0/1024.0);
|
| 949 |
}
|
| 950 |
|
| 951 |
// load weights
|
|
|
|
| 1008 |
n_loaded++;
|
| 1009 |
}
|
| 1010 |
|
| 1011 |
+
fprintf(stderr, "%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
|
| 1012 |
|
| 1013 |
if (n_loaded == 0) {
|
| 1014 |
+
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
|
| 1015 |
} else if (n_loaded != (int) model.tensors.size()) {
|
| 1016 |
fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), n_loaded);
|
| 1017 |
return false;
|
|
|
|
| 2242 |
void whisper_print_timings(struct whisper_context * ctx) {
|
| 2243 |
const int64_t t_end_us = ggml_time_us();
|
| 2244 |
|
| 2245 |
+
fprintf(stderr, "\n");
|
| 2246 |
+
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f);
|
| 2247 |
+
fprintf(stderr, "%s: mel time = %8.2f ms\n", __func__, ctx->t_mel_us/1000.0f);
|
| 2248 |
+
fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, ctx->t_sample_us/1000.0f);
|
| 2249 |
+
fprintf(stderr, "%s: encode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_encode_us/1000.0f, ctx->t_encode_us/1000.0f/ctx->model.hparams.n_audio_layer);
|
| 2250 |
+
fprintf(stderr, "%s: decode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_decode_us/1000.0f, ctx->t_decode_us/1000.0f/ctx->model.hparams.n_text_layer);
|
| 2251 |
+
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
|
| 2252 |
}
|
| 2253 |
|
| 2254 |
////////////////////////////////////////////////////////////////////////////
|
|
|
|
| 2349 |
while (progress_cur >= progress_prev + progress_step) {
|
| 2350 |
progress_prev += progress_step;
|
| 2351 |
if (params.print_progress) {
|
| 2352 |
+
fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress_prev);
|
| 2353 |
}
|
| 2354 |
}
|
| 2355 |
|