ggerganov committed on
Commit
deeab09
·
unverified ·
1 Parent(s): bd16df8

ref #22 : add option to provide multiple input .wav files

Browse files
Files changed (2) hide show
  1. README.md +14 -17
  2. main.cpp +100 -84
README.md CHANGED
@@ -31,13 +31,12 @@ For a quick demo, simply run `make base.en`:
31
 
32
  ```java
33
  $ make base.en
34
-
35
- gcc -pthread -O3 -mavx -mavx2 -mfma -mf16c -c ggml.c
36
- g++ -pthread -O3 -std=c++11 -c main.cpp
37
- g++ -pthread -o main ggml.o main.o
38
  ./main -h
39
 
40
- usage: ./main [options]
41
 
42
  options:
43
  -h, --help show this help message and exit
@@ -49,11 +48,11 @@ options:
49
  -nt, --no_timestamps do not print timestamps
50
  -l LANG, --language LANG spoken language (default: en)
51
  -m FNAME, --model FNAME model path (default: models/ggml-base.en.bin)
52
- -f FNAME, --file FNAME input WAV file path (default: samples/jfk.wav)
53
 
54
  bash ./download-ggml-model.sh base.en
55
  Downloading ggml model base.en ...
56
- models/ggml-base.en.bin 100%[=====================================>] 141.11M 8.58MB/s in 22s
57
  Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
58
  You can now use it like this:
59
 
@@ -86,20 +85,18 @@ whisper_model_load: adding 1607 extra tokens
86
  whisper_model_load: ggml ctx size = 163.43 MB
87
  whisper_model_load: memory size = 22.83 MB
88
  whisper_model_load: model size = 140.54 MB
89
- log_mel_spectrogram: n_sample = 176000, n_len = 1100
90
- log_mel_spectrogram: recording length: 11.000000 s
91
 
92
- main: processing 176000 samples (11.0 sec), 4 threads, lang = english, task = transcribe, timestamps = 1 ...
93
 
94
- [00:00.000 --> 00:11.000] And so my fellow Americans ask not what your country can do for you. Ask what you can do for your country.
95
 
96
 
97
- main: load time = 82.05 ms
98
- main: mel time = 44.15 ms
99
- main: sample time = 1.98 ms
100
- main: encode time = 674.77 ms / 112.46 ms per layer
101
- main: decode time = 82.91 ms
102
- main: total time = 886.29 ms
103
  ```
104
 
105
  The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
 
31
 
32
  ```java
33
  $ make base.en
34
+ cc -O3 -std=c11 -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -pthread -c ggml.c
35
+ c++ -O3 -std=c++11 -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -pthread -c whisper.cpp
36
+ c++ -O3 -std=c++11 -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -pthread main.cpp whisper.o ggml.o -o main
 
37
  ./main -h
38
 
39
+ usage: ./main [options] file0.wav file1.wav ...
40
 
41
  options:
42
  -h, --help show this help message and exit
 
48
  -nt, --no_timestamps do not print timestamps
49
  -l LANG, --language LANG spoken language (default: en)
50
  -m FNAME, --model FNAME model path (default: models/ggml-base.en.bin)
51
+ -f FNAME, --file FNAME input WAV file path
52
 
53
  bash ./download-ggml-model.sh base.en
54
  Downloading ggml model base.en ...
55
+ models/ggml-base.en.bin 100%[===================================>] 141.11M 6.49MB/s in 23s
56
  Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
57
  You can now use it like this:
58
 
 
85
  whisper_model_load: ggml ctx size = 163.43 MB
86
  whisper_model_load: memory size = 22.83 MB
87
  whisper_model_load: model size = 140.54 MB
 
 
88
 
89
+ main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, lang = en, task = transcribe, timestamps = 1 ...
90
 
91
+ [00:00.000 --> 00:11.000] And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
92
 
93
 
94
+ whisper_print_timings: load time = 77.48 ms
95
+ whisper_print_timings: mel time = 26.10 ms
96
+ whisper_print_timings: sample time = 2.19 ms
97
+ whisper_print_timings: encode time = 632.95 ms / 105.49 ms per layer
98
+ whisper_print_timings: decode time = 85.11 ms / 14.18 ms per layer
99
+ whisper_print_timings: total time = 824.14 ms
100
  ```
101
 
102
  The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
main.cpp CHANGED
@@ -36,7 +36,8 @@ struct whisper_params {
36
 
37
  std::string language = "en";
38
  std::string model = "models/ggml-base.en.bin";
39
- std::string fname_inp = "samples/jfk.wav";
 
40
  };
41
 
42
  void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -45,6 +46,11 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
45
  for (int i = 1; i < argc; i++) {
46
  std::string arg = argv[i];
47
 
 
 
 
 
 
48
  if (arg == "-s" || arg == "--seed") {
49
  params.seed = std::stoi(argv[++i]);
50
  } else if (arg == "-t" || arg == "--threads") {
@@ -67,7 +73,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
67
  } else if (arg == "-m" || arg == "--model") {
68
  params.model = argv[++i];
69
  } else if (arg == "-f" || arg == "--file") {
70
- params.fname_inp = argv[++i];
71
  } else if (arg == "-h" || arg == "--help") {
72
  whisper_print_usage(argc, argv, params);
73
  exit(0);
@@ -83,7 +89,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
83
 
84
  void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
85
  fprintf(stderr, "\n");
86
- fprintf(stderr, "usage: %s [options]\n", argv[0]);
87
  fprintf(stderr, "\n");
88
  fprintf(stderr, "options:\n");
89
  fprintf(stderr, " -h, --help show this help message and exit\n");
@@ -95,7 +101,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
95
  fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
96
  fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
97
  fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
98
- fprintf(stderr, " -f FNAME, --file FNAME input WAV file path (default: %s)\n", params.fname_inp.c_str());
99
  fprintf(stderr, "\n");
100
  }
101
 
@@ -110,106 +116,116 @@ int main(int argc, char ** argv) {
110
  params.seed = time(NULL);
111
  }
112
 
 
 
 
 
 
 
113
  // whisper init
114
 
115
  struct whisper_context * ctx = whisper_init(params.model.c_str());
116
 
117
- // WAV input
118
- std::vector<float> pcmf32;
119
- {
120
- drwav wav;
121
- if (!drwav_init_file(&wav, params.fname_inp.c_str(), NULL)) {
122
- fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], params.fname_inp.c_str());
123
- whisper_print_usage(argc, argv, {});
124
- return 2;
125
- }
 
 
 
126
 
127
- if (wav.channels != 1 && wav.channels != 2) {
128
- fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], params.fname_inp.c_str());
129
- return 3;
130
- }
131
 
132
- if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
133
- fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], params.fname_inp.c_str());
134
- return 4;
135
- }
136
 
137
- if (wav.bitsPerSample != 16) {
138
- fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], params.fname_inp.c_str());
139
- return 5;
140
- }
141
 
142
- int n = wav.totalPCMFrameCount;
143
 
144
- std::vector<int16_t> pcm16;
145
- pcm16.resize(n*wav.channels);
146
- drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
147
- drwav_uninit(&wav);
148
 
149
- // convert to mono, float
150
- pcmf32.resize(n);
151
- if (wav.channels == 1) {
152
- for (int i = 0; i < n; i++) {
153
- pcmf32[i] = float(pcm16[i])/32768.0f;
154
- }
155
- } else {
156
- for (int i = 0; i < n; i++) {
157
- pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
 
158
  }
159
  }
160
- }
161
 
162
- // print some info about the processing
163
- {
164
- printf("\n");
165
- if (!whisper_is_multilingual(ctx)) {
166
- if (params.language != "en" || params.translate) {
167
- params.language = "en";
168
- params.translate = false;
169
- printf("%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
 
170
  }
 
 
 
 
 
 
171
  }
172
- printf("%s: processing %d samples (%.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
173
- __func__, int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, params.n_threads,
174
- params.language.c_str(),
175
- params.translate ? "translate" : "transcribe",
176
- params.no_timestamps ? 0 : 1);
177
- printf("\n");
178
- }
179
 
180
- // run the inference
181
- {
182
- whisper_full_params wparams = whisper_full_default_params(WHISPER_DECODE_GREEDY);
183
-
184
- wparams.print_realtime = true;
185
- wparams.print_progress = false;
186
- wparams.print_timestamps = !params.no_timestamps;
187
- wparams.print_special_tokens = params.print_special_tokens;
188
- wparams.translate = params.translate;
189
- wparams.language = params.language.c_str();
190
- wparams.n_threads = params.n_threads;
191
-
192
- if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
193
- fprintf(stderr, "%s: failed to process audio\n", argv[0]);
194
- return 6;
195
- }
196
 
197
- // print result;
198
- if (!wparams.print_realtime) {
199
- printf("\n");
200
 
201
- const int n_segments = whisper_full_n_segments(ctx);
202
- for (int i = 0; i < n_segments; ++i) {
203
- const char * text = whisper_full_get_segment_text(ctx, i);
204
 
205
- if (params.no_timestamps) {
206
- printf ("%s", text);
207
- fflush(stdout);
208
- } else {
209
- const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
210
- const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
211
 
212
- printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
 
213
  }
214
  }
215
  }
 
36
 
37
  std::string language = "en";
38
  std::string model = "models/ggml-base.en.bin";
39
+
40
+ std::vector<std::string> fname_inp = {};
41
  };
42
 
43
  void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
 
46
  for (int i = 1; i < argc; i++) {
47
  std::string arg = argv[i];
48
 
49
+ if (arg[0] != '-') {
50
+ params.fname_inp.push_back(arg);
51
+ continue;
52
+ }
53
+
54
  if (arg == "-s" || arg == "--seed") {
55
  params.seed = std::stoi(argv[++i]);
56
  } else if (arg == "-t" || arg == "--threads") {
 
73
  } else if (arg == "-m" || arg == "--model") {
74
  params.model = argv[++i];
75
  } else if (arg == "-f" || arg == "--file") {
76
+ params.fname_inp.push_back(argv[++i]);
77
  } else if (arg == "-h" || arg == "--help") {
78
  whisper_print_usage(argc, argv, params);
79
  exit(0);
 
89
 
90
  void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
91
  fprintf(stderr, "\n");
92
+ fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
93
  fprintf(stderr, "\n");
94
  fprintf(stderr, "options:\n");
95
  fprintf(stderr, " -h, --help show this help message and exit\n");
 
101
  fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
102
  fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
103
  fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
104
+ fprintf(stderr, " -f FNAME, --file FNAME input WAV file path\n");
105
  fprintf(stderr, "\n");
106
  }
107
 
 
116
  params.seed = time(NULL);
117
  }
118
 
119
+ if (params.fname_inp.empty()) {
120
+ fprintf(stderr, "error: no input files specified\n");
121
+ whisper_print_usage(argc, argv, params);
122
+ return 1;
123
+ }
124
+
125
  // whisper init
126
 
127
  struct whisper_context * ctx = whisper_init(params.model.c_str());
128
 
129
+ for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
130
+ const auto fname_inp = params.fname_inp[f];
131
+
132
+ // WAV input
133
+ std::vector<float> pcmf32;
134
+ {
135
+ drwav wav;
136
+ if (!drwav_init_file(&wav, fname_inp.c_str(), NULL)) {
137
+ fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], fname_inp.c_str());
138
+ whisper_print_usage(argc, argv, {});
139
+ return 2;
140
+ }
141
 
142
+ if (wav.channels != 1 && wav.channels != 2) {
143
+ fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
144
+ return 3;
145
+ }
146
 
147
+ if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
148
+ fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
149
+ return 4;
150
+ }
151
 
152
+ if (wav.bitsPerSample != 16) {
153
+ fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
154
+ return 5;
155
+ }
156
 
157
+ int n = wav.totalPCMFrameCount;
158
 
159
+ std::vector<int16_t> pcm16;
160
+ pcm16.resize(n*wav.channels);
161
+ drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
162
+ drwav_uninit(&wav);
163
 
164
+ // convert to mono, float
165
+ pcmf32.resize(n);
166
+ if (wav.channels == 1) {
167
+ for (int i = 0; i < n; i++) {
168
+ pcmf32[i] = float(pcm16[i])/32768.0f;
169
+ }
170
+ } else {
171
+ for (int i = 0; i < n; i++) {
172
+ pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
173
+ }
174
  }
175
  }
 
176
 
177
+ // print some info about the processing
178
+ {
179
+ printf("\n");
180
+ if (!whisper_is_multilingual(ctx)) {
181
+ if (params.language != "en" || params.translate) {
182
+ params.language = "en";
183
+ params.translate = false;
184
+ printf("%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
185
+ }
186
  }
187
+ printf("%s: processing '%s' (%d samples, %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
188
+ __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, params.n_threads,
189
+ params.language.c_str(),
190
+ params.translate ? "translate" : "transcribe",
191
+ params.no_timestamps ? 0 : 1);
192
+ printf("\n");
193
  }
 
 
 
 
 
 
 
194
 
195
+ // run the inference
196
+ {
197
+ whisper_full_params wparams = whisper_full_default_params(WHISPER_DECODE_GREEDY);
198
+
199
+ wparams.print_realtime = true;
200
+ wparams.print_progress = false;
201
+ wparams.print_timestamps = !params.no_timestamps;
202
+ wparams.print_special_tokens = params.print_special_tokens;
203
+ wparams.translate = params.translate;
204
+ wparams.language = params.language.c_str();
205
+ wparams.n_threads = params.n_threads;
206
+
207
+ if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
208
+ fprintf(stderr, "%s: failed to process audio\n", argv[0]);
209
+ return 6;
210
+ }
211
 
212
+ // print result;
213
+ if (!wparams.print_realtime) {
214
+ printf("\n");
215
 
216
+ const int n_segments = whisper_full_n_segments(ctx);
217
+ for (int i = 0; i < n_segments; ++i) {
218
+ const char * text = whisper_full_get_segment_text(ctx, i);
219
 
220
+ if (params.no_timestamps) {
221
+ printf ("%s", text);
222
+ fflush(stdout);
223
+ } else {
224
+ const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
225
+ const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
226
 
227
+ printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
228
+ }
229
  }
230
  }
231
  }