ggerganov committed
Commit 67be3ed · Parent(s): 4860566

ref #17 : print whisper logs to stderr


Only the transcribed/translated text is printed to stdout.
This way, one can redirect the result to a file.
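
This follows the usual Unix convention: diagnostics on stderr, payload on stdout. A minimal sketch of the pattern (the log line and transcript string below are illustrative, not taken from the repo):

    #include <cstdio>

    int main() {
        // diagnostics go to stderr, so they stay visible on the
        // terminal even when stdout is redirected to a file
        fprintf(stderr, "log: loading model ...\n");

        // only the payload (the transcribed text) goes to stdout
        printf("And so my fellow Americans ...\n");
        fflush(stdout);

        return 0;
    }

With that split, an invocation along the lines of ./main -f samples/jfk.wav > transcript.txt keeps the log lines on the terminal while transcript.txt receives only the text.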

Files changed (2)
  1. main.cpp +11 -11
  2. whisper.cpp +27 -27
main.cpp CHANGED
@@ -192,21 +192,21 @@ int main(int argc, char ** argv) {
 
     // print some info about the processing
     {
-        printf("\n");
+        fprintf(stderr, "\n");
         if (!whisper_is_multilingual(ctx)) {
             if (params.language != "en" || params.translate) {
                 params.language = "en";
                 params.translate = false;
-                printf("%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
             }
         }
-        printf("%s: processing '%s' (%d samples, %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
+        fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
                 __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, params.n_threads,
                 params.language.c_str(),
                 params.translate ? "translate" : "transcribe",
                 params.no_timestamps ? 0 : 1);
 
-        printf("\n");
+        fprintf(stderr, "\n");
     }
 
 
@@ -230,25 +230,25 @@ int main(int argc, char ** argv) {
 
     // print result
     if (!wparams.print_realtime) {
-        printf("\n");
+        fprintf(stderr, "\n");
 
         const int n_segments = whisper_full_n_segments(ctx);
         for (int i = 0; i < n_segments; ++i) {
             const char * text = whisper_full_get_segment_text(ctx, i);
 
             if (params.no_timestamps) {
-                printf ("%s", text);
+                fprintf(stderr, "%s", text);
                 fflush(stdout);
             } else {
                 const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
                 const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
 
-                printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
+                fprintf(stderr, "[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
             }
         }
     }
 
-    printf("\n");
+    fprintf(stderr, "\n");
 
     // output to text file
     if (params.output_txt) {
@@ -260,7 +260,7 @@ int main(int argc, char ** argv) {
             return 8;
         }
 
-        printf("%s: saving output to '%s.txt'\n", __func__, fname_inp.c_str());
+        fprintf(stderr, "%s: saving output to '%s.txt'\n", __func__, fname_inp.c_str());
 
         const int n_segments = whisper_full_n_segments(ctx);
         for (int i = 0; i < n_segments; ++i) {
@@ -279,7 +279,7 @@ int main(int argc, char ** argv) {
             return 9;
         }
 
-        printf("%s: saving output to '%s.vtt'\n", __func__, fname_inp.c_str());
+        fprintf(stderr, "%s: saving output to '%s.vtt'\n", __func__, fname_inp.c_str());
 
         fout_vtt << "WEBVTT\n\n";
 
@@ -304,7 +304,7 @@ int main(int argc, char ** argv) {
             return 10;
         }
 
-        printf("%s: saving output to '%s.srt'\n", __func__, fname_inp.c_str());
+        fprintf(stderr, "%s: saving output to '%s.srt'\n", __func__, fname_inp.c_str());
 
         const int n_segments = whisper_full_n_segments(ctx);
         for (int i = 0; i < n_segments; ++i) {
whisper.cpp CHANGED
@@ -421,7 +421,7 @@ struct whisper_context {
 // see the convert-pt-to-ggml.py script for details
 //
 bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
+    fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str());
 
     auto & model = wctx.model;
     auto & vocab = wctx.vocab;
@@ -480,18 +480,18 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
            model.type = e_model::MODEL_LARGE;
         }
 
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
-        printf("%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
-        printf("%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
-        printf("%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
-        printf("%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
-        printf("%s: n_text_state = %d\n", __func__, hparams.n_text_state);
-        printf("%s: n_text_head = %d\n", __func__, hparams.n_text_head);
-        printf("%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
-        printf("%s: n_mels = %d\n", __func__, hparams.n_mels);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
-        printf("%s: type = %d\n", __func__, model.type);
+        fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
+        fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
+        fprintf(stderr, "%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
+        fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
+        fprintf(stderr, "%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
+        fprintf(stderr, "%s: n_text_state = %d\n", __func__, hparams.n_text_state);
+        fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
+        fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
+        fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
+        fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
+        fprintf(stderr, "%s: type = %d\n", __func__, model.type);
 
         wctx.buf_model.resize(MEM_REQ_MODEL.at(model.type));
         wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
@@ -503,7 +503,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
                    wctx.buf_compute.size() +
                    wctx.buf_compute_layer.size();
 
-        printf("%s: mem_required = %.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+        fprintf(stderr, "%s: mem_required = %.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
     }
 
     // load mel filters
@@ -553,7 +553,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
         }
 
         if (n_vocab < model.hparams.n_vocab) {
-            printf("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
+            fprintf(stderr, "%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
             for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
                 if (i > vocab.token_beg) {
                     word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
@@ -698,7 +698,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
 
         ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
 
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
 
     // create the ggml context
@@ -945,7 +945,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
             ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) +
             ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);
 
-        printf("%s: memory size = %8.2f MB \n", __func__, memory_size/1024.0/1024.0);
+        fprintf(stderr, "%s: memory size = %8.2f MB \n", __func__, memory_size/1024.0/1024.0);
     }
 
     // load weights
@@ -1008,10 +1008,10 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
             n_loaded++;
         }
 
-        printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
+        fprintf(stderr, "%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
 
         if (n_loaded == 0) {
-            printf("%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
+            fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
         } else if (n_loaded != (int) model.tensors.size()) {
             fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), n_loaded);
             return false;
@@ -2242,13 +2242,13 @@ whisper_token whisper_token_transcribe() {
 void whisper_print_timings(struct whisper_context * ctx) {
     const int64_t t_end_us = ggml_time_us();
 
-    printf("\n");
-    printf("%s: load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f);
-    printf("%s: mel time = %8.2f ms\n", __func__, ctx->t_mel_us/1000.0f);
-    printf("%s: sample time = %8.2f ms\n", __func__, ctx->t_sample_us/1000.0f);
-    printf("%s: encode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_encode_us/1000.0f, ctx->t_encode_us/1000.0f/ctx->model.hparams.n_audio_layer);
-    printf("%s: decode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_decode_us/1000.0f, ctx->t_decode_us/1000.0f/ctx->model.hparams.n_text_layer);
-    printf("%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f);
+    fprintf(stderr, "%s: mel time = %8.2f ms\n", __func__, ctx->t_mel_us/1000.0f);
+    fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, ctx->t_sample_us/1000.0f);
+    fprintf(stderr, "%s: encode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_encode_us/1000.0f, ctx->t_encode_us/1000.0f/ctx->model.hparams.n_audio_layer);
+    fprintf(stderr, "%s: decode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_decode_us/1000.0f, ctx->t_decode_us/1000.0f/ctx->model.hparams.n_text_layer);
+    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
 }
 
 ////////////////////////////////////////////////////////////////////////////
@@ -2349,7 +2349,7 @@ int whisper_full(
         while (progress_cur >= progress_prev + progress_step) {
             progress_prev += progress_step;
             if (params.print_progress) {
-                printf("%s: progress = %3d%%\n", __func__, progress_prev);
+                fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress_prev);
             }
         }
 
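
The change edits every print call site individually. A hypothetical alternative, sketched below, would funnel diagnostics through one variadic helper so the destination could later be swapped (for a file or a user callback) in a single place; the whisper_log name is invented here and does not exist in the code at this commit:

    #include <cstdarg>
    #include <cstdio>

    // Hypothetical choke point for all library diagnostics:
    // forwards a printf-style format to stderr.
    static void whisper_log(const char * fmt, ...) {
        va_list args;
        va_start(args, fmt);
        vfprintf(stderr, fmt, args);
        va_end(args);
    }

Each fprintf(stderr, ...) in whisper.cpp could then read, for example, whisper_log("%s: loading model from '%s'\n", __func__, fname.c_str());.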