Daniel Tang committed on
Commit
7e3c27c
·
unverified ·
1 Parent(s): 8566207

cli : support "-" for stdout like stdin (#3050)

Browse files

This changes examples/cli/cli.cpp to be like
examples/common-whisper.cpp. "-of -" can be specified (or this can be
inferred from "-" as the input file) to output to stdout. This is useful
for piping to other applications.

Log fname_out consistently when not stdout
- Terminals have stdout=stderr, so remove the message before
successful output to ease copying
- Don't affect actual error messages
- Move opening the ofstream into the factory, fixing missing
open and/or error messages in output_score/output_wts
- Fix struct naming convention

Closes #3048

Files changed (1) hide show
  1. examples/cli/cli.cpp +80 -129
examples/cli/cli.cpp CHANGED
@@ -9,6 +9,7 @@
9
  #include <cstdio>
10
  #include <string>
11
  #include <thread>
 
12
  #include <vector>
13
  #include <cstring>
14
 
@@ -379,15 +380,7 @@ static void whisper_print_segment_callback(struct whisper_context * ctx, struct
379
  }
380
  }
381
 
382
- static bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
383
- std::ofstream fout(fname);
384
- if (!fout.is_open()) {
385
- fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
386
- return false;
387
- }
388
-
389
- fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
390
-
391
  const int n_segments = whisper_full_n_segments(ctx);
392
  for (int i = 0; i < n_segments; ++i) {
393
  const char * text = whisper_full_get_segment_text(ctx, i);
@@ -402,19 +395,9 @@ static bool output_txt(struct whisper_context * ctx, const char * fname, const w
402
 
403
  fout << speaker << text << "\n";
404
  }
405
-
406
- return true;
407
  }
408
 
409
- static bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
410
- std::ofstream fout(fname);
411
- if (!fout.is_open()) {
412
- fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
413
- return false;
414
- }
415
-
416
- fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
417
-
418
  fout << "WEBVTT\n\n";
419
 
420
  const int n_segments = whisper_full_n_segments(ctx);
@@ -434,19 +417,9 @@ static bool output_vtt(struct whisper_context * ctx, const char * fname, const w
434
  fout << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
435
  fout << speaker << text << "\n\n";
436
  }
437
-
438
- return true;
439
  }
440
 
441
- static bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
442
- std::ofstream fout(fname);
443
- if (!fout.is_open()) {
444
- fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
445
- return false;
446
- }
447
-
448
- fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
449
-
450
  const int n_segments = whisper_full_n_segments(ctx);
451
  for (int i = 0; i < n_segments; ++i) {
452
  const char * text = whisper_full_get_segment_text(ctx, i);
@@ -463,8 +436,6 @@ static bool output_srt(struct whisper_context * ctx, const char * fname, const w
463
  fout << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
464
  fout << speaker << text << "\n\n";
465
  }
466
-
467
- return true;
468
  }
469
 
470
  static char * escape_double_quotes_and_backslashes(const char * str) {
@@ -530,15 +501,7 @@ static char * escape_double_quotes_in_csv(const char * str) {
530
  return escaped;
531
  }
532
 
533
- static bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
534
- std::ofstream fout(fname);
535
- if (!fout.is_open()) {
536
- fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
537
- return false;
538
- }
539
-
540
- fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
541
-
542
  const int n_segments = whisper_full_n_segments(ctx);
543
  fout << "start,end,";
544
  if (params.diarize && pcmf32s.size() == 2)
@@ -561,14 +524,9 @@ static bool output_csv(struct whisper_context * ctx, const char * fname, const w
561
  }
562
  fout << "\"" << text_escaped << "\"\n";
563
  }
564
-
565
- return true;
566
  }
567
 
568
- static bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
569
- std::ofstream fout(fname);
570
- fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
571
-
572
  const int n_segments = whisper_full_n_segments(ctx);
573
  // fprintf(stderr,"segments: %d\n",n_segments);
574
  for (int i = 0; i < n_segments; ++i) {
@@ -581,16 +539,14 @@ static bool output_score(struct whisper_context * ctx, const char * fname, const
581
  // fprintf(stderr,"token: %s %f\n",token,probability);
582
  }
583
  }
584
- return true;
585
  }
586
 
587
- static bool output_json(
588
  struct whisper_context * ctx,
589
- const char * fname,
590
  const whisper_params & params,
591
- std::vector<std::vector<float>> pcmf32s,
592
- bool full) {
593
- std::ofstream fout(fname);
594
  int indent = 0;
595
 
596
  auto doindent = [&]() {
@@ -670,12 +626,6 @@ static bool output_json(
670
  end_obj(end);
671
  };
672
 
673
- if (!fout.is_open()) {
674
- fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
675
- return false;
676
- }
677
-
678
- fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
679
  start_obj(nullptr);
680
  value_s("systeminfo", whisper_print_system_info(), false);
681
  start_obj("model");
@@ -749,17 +699,12 @@ static bool output_json(
749
 
750
  end_arr(true);
751
  end_obj(true);
752
- return true;
753
  }
754
 
755
  // karaoke video generation
756
  // outputs a bash script that uses ffmpeg to generate a video with the subtitles
757
  // TODO: font parameter adjustments
758
- static bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec, std::vector<std::vector<float>> pcmf32s) {
759
- std::ofstream fout(fname);
760
-
761
- fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
762
-
763
  static const char * font = params.font_path.c_str();
764
 
765
  std::ifstream fin(font);
@@ -875,20 +820,12 @@ static bool output_wts(struct whisper_context * ctx, const char * fname, const c
875
 
876
  fout.close();
877
 
878
- fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);
879
 
880
  return true;
881
  }
882
 
883
- static bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
884
- std::ofstream fout(fname);
885
- if (!fout.is_open()) {
886
- fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
887
- return false;
888
- }
889
-
890
- fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
891
-
892
  fout << "[by:whisper.cpp]\n";
893
 
894
  const int n_segments = whisper_full_n_segments(ctx);
@@ -916,8 +853,6 @@ static bool output_lrc(struct whisper_context * ctx, const char * fname, const w
916
 
917
  fout << '[' << timestamp_lrc << ']' << speaker << text << "\n";
918
  }
919
-
920
- return true;
921
  }
922
 
923
 
@@ -1066,8 +1001,52 @@ int main(int argc, char ** argv) {
1066
  }
1067
 
1068
  for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
1069
- const auto fname_inp = params.fname_inp[f];
1070
- const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1071
 
1072
  std::vector<float> pcmf32; // mono-channel F32 PCM
1073
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
@@ -1172,7 +1151,7 @@ int main(int argc, char ** argv) {
1172
 
1173
  // this callback is called on each new segment
1174
  if (!wparams.print_realtime) {
1175
- wparams.new_segment_callback = whisper_print_segment_callback;
1176
  wparams.new_segment_callback_user_data = &user_data;
1177
  }
1178
 
@@ -1214,54 +1193,26 @@ int main(int argc, char ** argv) {
1214
 
1215
  // output stuff
1216
  {
1217
- printf("\n");
1218
-
1219
- // output to text file
1220
- if (params.output_txt) {
1221
- const auto fname_txt = fname_out + ".txt";
1222
- output_txt(ctx, fname_txt.c_str(), params, pcmf32s);
1223
- }
1224
-
1225
- // output to VTT file
1226
- if (params.output_vtt) {
1227
- const auto fname_vtt = fname_out + ".vtt";
1228
- output_vtt(ctx, fname_vtt.c_str(), params, pcmf32s);
1229
- }
1230
-
1231
- // output to SRT file
1232
- if (params.output_srt) {
1233
- const auto fname_srt = fname_out + ".srt";
1234
- output_srt(ctx, fname_srt.c_str(), params, pcmf32s);
1235
- }
1236
-
1237
- // output to WTS file
1238
- if (params.output_wts) {
1239
- const auto fname_wts = fname_out + ".wts";
1240
- output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE, pcmf32s);
1241
- }
1242
-
1243
- // output to CSV file
1244
- if (params.output_csv) {
1245
- const auto fname_csv = fname_out + ".csv";
1246
- output_csv(ctx, fname_csv.c_str(), params, pcmf32s);
1247
- }
1248
-
1249
- // output to JSON file
1250
- if (params.output_jsn) {
1251
- const auto fname_jsn = fname_out + ".json";
1252
- output_json(ctx, fname_jsn.c_str(), params, pcmf32s, params.output_jsn_full);
1253
- }
1254
-
1255
- // output to LRC file
1256
- if (params.output_lrc) {
1257
- const auto fname_lrc = fname_out + ".lrc";
1258
- output_lrc(ctx, fname_lrc.c_str(), params, pcmf32s);
1259
- }
1260
-
1261
- // output to score file
1262
- if (params.log_score) {
1263
- const auto fname_score = fname_out + ".score.txt";
1264
- output_score(ctx, fname_score.c_str(), params, pcmf32s);
1265
  }
1266
  }
1267
  }
 
9
  #include <cstdio>
10
  #include <string>
11
  #include <thread>
12
+ #include <utility>
13
  #include <vector>
14
  #include <cstring>
15
 
 
380
  }
381
  }
382
 
383
+ static void output_txt(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
 
 
 
 
 
 
 
 
384
  const int n_segments = whisper_full_n_segments(ctx);
385
  for (int i = 0; i < n_segments; ++i) {
386
  const char * text = whisper_full_get_segment_text(ctx, i);
 
395
 
396
  fout << speaker << text << "\n";
397
  }
 
 
398
  }
399
 
400
+ static void output_vtt(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
 
 
 
 
 
 
 
 
401
  fout << "WEBVTT\n\n";
402
 
403
  const int n_segments = whisper_full_n_segments(ctx);
 
417
  fout << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
418
  fout << speaker << text << "\n\n";
419
  }
 
 
420
  }
421
 
422
+ static void output_srt(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
 
 
 
 
 
 
 
 
423
  const int n_segments = whisper_full_n_segments(ctx);
424
  for (int i = 0; i < n_segments; ++i) {
425
  const char * text = whisper_full_get_segment_text(ctx, i);
 
436
  fout << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
437
  fout << speaker << text << "\n\n";
438
  }
 
 
439
  }
440
 
441
  static char * escape_double_quotes_and_backslashes(const char * str) {
 
501
  return escaped;
502
  }
503
 
504
+ static void output_csv(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
 
 
 
 
 
 
 
 
505
  const int n_segments = whisper_full_n_segments(ctx);
506
  fout << "start,end,";
507
  if (params.diarize && pcmf32s.size() == 2)
 
524
  }
525
  fout << "\"" << text_escaped << "\"\n";
526
  }
 
 
527
  }
528
 
529
+ static void output_score(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
 
 
 
530
  const int n_segments = whisper_full_n_segments(ctx);
531
  // fprintf(stderr,"segments: %d\n",n_segments);
532
  for (int i = 0; i < n_segments; ++i) {
 
539
  // fprintf(stderr,"token: %s %f\n",token,probability);
540
  }
541
  }
 
542
  }
543
 
544
+ static void output_json(
545
  struct whisper_context * ctx,
546
+ std::ofstream & fout,
547
  const whisper_params & params,
548
+ std::vector<std::vector<float>> pcmf32s) {
549
+ const bool full = params.output_jsn_full;
 
550
  int indent = 0;
551
 
552
  auto doindent = [&]() {
 
626
  end_obj(end);
627
  };
628
 
 
 
 
 
 
 
629
  start_obj(nullptr);
630
  value_s("systeminfo", whisper_print_system_info(), false);
631
  start_obj("model");
 
699
 
700
  end_arr(true);
701
  end_obj(true);
 
702
  }
703
 
704
  // karaoke video generation
705
  // outputs a bash script that uses ffmpeg to generate a video with the subtitles
706
  // TODO: font parameter adjustments
707
+ static bool output_wts(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s, const char * fname_inp, float t_sec, const char * fname_out) {
 
 
 
 
708
  static const char * font = params.font_path.c_str();
709
 
710
  std::ifstream fin(font);
 
820
 
821
  fout.close();
822
 
823
+ fprintf(stderr, "# %s: run 'source %s' to generate karaoke video\n", __func__, fname_out);
824
 
825
  return true;
826
  }
827
 
828
+ static void output_lrc(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
 
 
 
 
 
 
 
 
829
  fout << "[by:whisper.cpp]\n";
830
 
831
  const int n_segments = whisper_full_n_segments(ctx);
 
853
 
854
  fout << '[' << timestamp_lrc << ']' << speaker << text << "\n";
855
  }
 
 
856
  }
857
 
858
 
 
1001
  }
1002
 
1003
  for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
1004
+ const auto & fname_inp = params.fname_inp[f];
1005
+ struct fout_factory {
1006
+ std::string fname_out;
1007
+ const size_t basename_length;
1008
+ const bool is_stdout;
1009
+ bool used_stdout;
1010
+ decltype(whisper_print_segment_callback) * const print_segment_callback;
1011
+ std::ofstream fout;
1012
+
1013
+ fout_factory (const std::string & fname_out_, const std::string & fname_inp, whisper_params & params) :
1014
+ fname_out{!fname_out_.empty() ? fname_out_ : fname_inp},
1015
+ basename_length{fname_out.size()},
1016
+ is_stdout{fname_out == "-"},
1017
+ used_stdout{},
1018
+ print_segment_callback{is_stdout ? nullptr : whisper_print_segment_callback} {
1019
+ if (!print_segment_callback) {
1020
+ params.print_progress = false;
1021
+ }
1022
+ }
1023
+
1024
+ bool open(const char * ext, const char * function) {
1025
+ if (is_stdout) {
1026
+ if (std::exchange(used_stdout, true)) {
1027
+ fprintf(stderr, "warning: Not appending multiple file formats to stdout\n");
1028
+ return false;
1029
+ }
1030
+ #ifdef _WIN32
1031
+ fout = std::ofstream{"CON"};
1032
+ #else
1033
+ fout = std::ofstream{"/dev/stdout"};
1034
+ #endif
1035
+ // Not using fprintf stderr here because it might equal stdout
1036
+ // Also assuming /dev is mounted
1037
+ return true;
1038
+ }
1039
+ fname_out.resize(basename_length);
1040
+ fname_out += ext;
1041
+ fout = std::ofstream{fname_out};
1042
+ if (!fout.is_open()) {
1043
+ fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
1044
+ return false;
1045
+ }
1046
+ fprintf(stderr, "%s: saving output to '%s'\n", function, fname_out.c_str());
1047
+ return true;
1048
+ }
1049
+ } fout_factory{f < (int) params.fname_out.size() ? params.fname_out[f] : "", fname_inp, params};
1050
 
1051
  std::vector<float> pcmf32; // mono-channel F32 PCM
1052
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
 
1151
 
1152
  // this callback is called on each new segment
1153
  if (!wparams.print_realtime) {
1154
+ wparams.new_segment_callback = fout_factory.print_segment_callback;
1155
  wparams.new_segment_callback_user_data = &user_data;
1156
  }
1157
 
 
1193
 
1194
  // output stuff
1195
  {
1196
+ // macros to stringify function name
1197
+ #define output_func(func, ext, param, ...) if (param && fout_factory.open(ext, #func)) {\
1198
+ func(ctx, fout_factory.fout, params, __VA_ARGS__); \
1199
+ }
1200
+ #define output_ext(ext, ...) output_func(output_##ext, "." #ext, params.output_##ext, __VA_ARGS__)
1201
+
1202
+ output_ext(txt, pcmf32s);
1203
+ output_ext(vtt, pcmf32s);
1204
+ output_ext(srt, pcmf32s);
1205
+ output_ext(wts, pcmf32s, fname_inp.c_str(), float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE, fout_factory.fname_out.c_str());
1206
+ output_ext(csv, pcmf32s);
1207
+ output_func(output_json, ".json", params.output_jsn, pcmf32s);
1208
+ output_ext(lrc, pcmf32s);
1209
+ output_func(output_score, ".score.txt", params.log_score, pcmf32s);
1210
+
1211
+ #undef output_ext
1212
+ #undef output_func
1213
+
1214
+ if (fout_factory.is_stdout && !fout_factory.used_stdout) {
1215
+ fprintf(stderr, "warning: '--output-file -' used without any other '--output-*'");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1216
  }
1217
  }
1218
  }