Spaces:
Running
Running
ref #22 : add option to provide multiple input .wav files
Browse files
README.md
CHANGED
|
@@ -31,13 +31,12 @@ For a quick demo, simply run `make base.en`:
|
|
| 31 |
|
| 32 |
```java
|
| 33 |
$ make base.en
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
g++ -pthread -o main ggml.o main.o
|
| 38 |
./main -h
|
| 39 |
|
| 40 |
-
usage: ./main [options]
|
| 41 |
|
| 42 |
options:
|
| 43 |
-h, --help show this help message and exit
|
|
@@ -49,11 +48,11 @@ options:
|
|
| 49 |
-nt, --no_timestamps do not print timestamps
|
| 50 |
-l LANG, --language LANG spoken language (default: en)
|
| 51 |
-m FNAME, --model FNAME model path (default: models/ggml-base.en.bin)
|
| 52 |
-
-f FNAME, --file FNAME input WAV file path
|
| 53 |
|
| 54 |
bash ./download-ggml-model.sh base.en
|
| 55 |
Downloading ggml model base.en ...
|
| 56 |
-
models/ggml-base.en.bin
|
| 57 |
Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
|
| 58 |
You can now use it like this:
|
| 59 |
|
|
@@ -86,20 +85,18 @@ whisper_model_load: adding 1607 extra tokens
|
|
| 86 |
whisper_model_load: ggml ctx size = 163.43 MB
|
| 87 |
whisper_model_load: memory size = 22.83 MB
|
| 88 |
whisper_model_load: model size = 140.54 MB
|
| 89 |
-
log_mel_spectrogram: n_sample = 176000, n_len = 1100
|
| 90 |
-
log_mel_spectrogram: recording length: 11.000000 s
|
| 91 |
|
| 92 |
-
main: processing 176000 samples
|
| 93 |
|
| 94 |
-
[00:00.000 --> 00:11.000] And so my fellow Americans ask not what your country can do for you
|
| 95 |
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
```
|
| 104 |
|
| 105 |
The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
|
|
|
|
| 31 |
|
| 32 |
```java
|
| 33 |
$ make base.en
|
| 34 |
+
cc -O3 -std=c11 -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -pthread -c ggml.c
|
| 35 |
+
c++ -O3 -std=c++11 -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -pthread -c whisper.cpp
|
| 36 |
+
c++ -O3 -std=c++11 -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -pthread main.cpp whisper.o ggml.o -o main
|
|
|
|
| 37 |
./main -h
|
| 38 |
|
| 39 |
+
usage: ./main [options] file0.wav file1.wav ...
|
| 40 |
|
| 41 |
options:
|
| 42 |
-h, --help show this help message and exit
|
|
|
|
| 48 |
-nt, --no_timestamps do not print timestamps
|
| 49 |
-l LANG, --language LANG spoken language (default: en)
|
| 50 |
-m FNAME, --model FNAME model path (default: models/ggml-base.en.bin)
|
| 51 |
+
-f FNAME, --file FNAME input WAV file path
|
| 52 |
|
| 53 |
bash ./download-ggml-model.sh base.en
|
| 54 |
Downloading ggml model base.en ...
|
| 55 |
+
models/ggml-base.en.bin 100%[===================================>] 141.11M 6.49MB/s in 23s
|
| 56 |
Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
|
| 57 |
You can now use it like this:
|
| 58 |
|
|
|
|
| 85 |
whisper_model_load: ggml ctx size = 163.43 MB
|
| 86 |
whisper_model_load: memory size = 22.83 MB
|
| 87 |
whisper_model_load: model size = 140.54 MB
|
|
|
|
|
|
|
| 88 |
|
| 89 |
+
main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, lang = en, task = transcribe, timestamps = 1 ...
|
| 90 |
|
| 91 |
+
[00:00.000 --> 00:11.000] And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
|
| 92 |
|
| 93 |
|
| 94 |
+
whisper_print_timings: load time = 77.48 ms
|
| 95 |
+
whisper_print_timings: mel time = 26.10 ms
|
| 96 |
+
whisper_print_timings: sample time = 2.19 ms
|
| 97 |
+
whisper_print_timings: encode time = 632.95 ms / 105.49 ms per layer
|
| 98 |
+
whisper_print_timings: decode time = 85.11 ms / 14.18 ms per layer
|
| 99 |
+
whisper_print_timings: total time = 824.14 ms
|
| 100 |
```
|
| 101 |
|
| 102 |
The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
|
main.cpp
CHANGED
|
@@ -36,7 +36,8 @@ struct whisper_params {
|
|
| 36 |
|
| 37 |
std::string language = "en";
|
| 38 |
std::string model = "models/ggml-base.en.bin";
|
| 39 |
-
|
|
|
|
| 40 |
};
|
| 41 |
|
| 42 |
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
|
@@ -45,6 +46,11 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 45 |
for (int i = 1; i < argc; i++) {
|
| 46 |
std::string arg = argv[i];
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
if (arg == "-s" || arg == "--seed") {
|
| 49 |
params.seed = std::stoi(argv[++i]);
|
| 50 |
} else if (arg == "-t" || arg == "--threads") {
|
|
@@ -67,7 +73,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 67 |
} else if (arg == "-m" || arg == "--model") {
|
| 68 |
params.model = argv[++i];
|
| 69 |
} else if (arg == "-f" || arg == "--file") {
|
| 70 |
-
params.fname_inp = argv[++i];
|
| 71 |
} else if (arg == "-h" || arg == "--help") {
|
| 72 |
whisper_print_usage(argc, argv, params);
|
| 73 |
exit(0);
|
|
@@ -83,7 +89,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 83 |
|
| 84 |
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
| 85 |
fprintf(stderr, "\n");
|
| 86 |
-
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
| 87 |
fprintf(stderr, "\n");
|
| 88 |
fprintf(stderr, "options:\n");
|
| 89 |
fprintf(stderr, " -h, --help show this help message and exit\n");
|
|
@@ -95,7 +101,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
|
| 95 |
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
|
| 96 |
fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
|
| 97 |
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
|
| 98 |
-
fprintf(stderr, " -f FNAME, --file FNAME input WAV file path (default: %s)\n", params.fname_inp.c_str());
|
| 99 |
fprintf(stderr, "\n");
|
| 100 |
}
|
| 101 |
|
|
@@ -110,106 +116,116 @@ int main(int argc, char ** argv) {
|
|
| 110 |
params.seed = time(NULL);
|
| 111 |
}
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
// whisper init
|
| 114 |
|
| 115 |
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
|
| 142 |
-
|
| 143 |
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
|
|
|
| 158 |
}
|
| 159 |
}
|
| 160 |
-
}
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
|
|
|
| 170 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
}
|
| 172 |
-
printf("%s: processing %d samples (%.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
|
| 173 |
-
__func__, int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, params.n_threads,
|
| 174 |
-
params.language.c_str(),
|
| 175 |
-
params.translate ? "translate" : "transcribe",
|
| 176 |
-
params.no_timestamps ? 0 : 1);
|
| 177 |
-
printf("\n");
|
| 178 |
-
}
|
| 179 |
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
|
| 212 |
-
|
|
|
|
| 213 |
}
|
| 214 |
}
|
| 215 |
}
|
|
|
|
| 36 |
|
| 37 |
std::string language = "en";
|
| 38 |
std::string model = "models/ggml-base.en.bin";
|
| 39 |
+
|
| 40 |
+
std::vector<std::string> fname_inp = {};
|
| 41 |
};
|
| 42 |
|
| 43 |
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
|
|
|
| 46 |
for (int i = 1; i < argc; i++) {
|
| 47 |
std::string arg = argv[i];
|
| 48 |
|
| 49 |
+
if (arg[0] != '-') {
|
| 50 |
+
params.fname_inp.push_back(arg);
|
| 51 |
+
continue;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
if (arg == "-s" || arg == "--seed") {
|
| 55 |
params.seed = std::stoi(argv[++i]);
|
| 56 |
} else if (arg == "-t" || arg == "--threads") {
|
|
|
|
| 73 |
} else if (arg == "-m" || arg == "--model") {
|
| 74 |
params.model = argv[++i];
|
| 75 |
} else if (arg == "-f" || arg == "--file") {
|
| 76 |
+
params.fname_inp.push_back(argv[++i]);
|
| 77 |
} else if (arg == "-h" || arg == "--help") {
|
| 78 |
whisper_print_usage(argc, argv, params);
|
| 79 |
exit(0);
|
|
|
|
| 89 |
|
| 90 |
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
| 91 |
fprintf(stderr, "\n");
|
| 92 |
+
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
| 93 |
fprintf(stderr, "\n");
|
| 94 |
fprintf(stderr, "options:\n");
|
| 95 |
fprintf(stderr, " -h, --help show this help message and exit\n");
|
|
|
|
| 101 |
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
|
| 102 |
fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
|
| 103 |
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
|
| 104 |
+
fprintf(stderr, " -f FNAME, --file FNAME input WAV file path\n");
|
| 105 |
fprintf(stderr, "\n");
|
| 106 |
}
|
| 107 |
|
|
|
|
| 116 |
params.seed = time(NULL);
|
| 117 |
}
|
| 118 |
|
| 119 |
+
if (params.fname_inp.empty()) {
|
| 120 |
+
fprintf(stderr, "error: no input files specified\n");
|
| 121 |
+
whisper_print_usage(argc, argv, params);
|
| 122 |
+
return 1;
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
// whisper init
|
| 126 |
|
| 127 |
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
| 128 |
|
| 129 |
+
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
| 130 |
+
const auto fname_inp = params.fname_inp[f];
|
| 131 |
+
|
| 132 |
+
// WAV input
|
| 133 |
+
std::vector<float> pcmf32;
|
| 134 |
+
{
|
| 135 |
+
drwav wav;
|
| 136 |
+
if (!drwav_init_file(&wav, fname_inp.c_str(), NULL)) {
|
| 137 |
+
fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], fname_inp.c_str());
|
| 138 |
+
whisper_print_usage(argc, argv, {});
|
| 139 |
+
return 2;
|
| 140 |
+
}
|
| 141 |
|
| 142 |
+
if (wav.channels != 1 && wav.channels != 2) {
|
| 143 |
+
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
|
| 144 |
+
return 3;
|
| 145 |
+
}
|
| 146 |
|
| 147 |
+
if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
|
| 148 |
+
fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
|
| 149 |
+
return 4;
|
| 150 |
+
}
|
| 151 |
|
| 152 |
+
if (wav.bitsPerSample != 16) {
|
| 153 |
+
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
|
| 154 |
+
return 5;
|
| 155 |
+
}
|
| 156 |
|
| 157 |
+
int n = wav.totalPCMFrameCount;
|
| 158 |
|
| 159 |
+
std::vector<int16_t> pcm16;
|
| 160 |
+
pcm16.resize(n*wav.channels);
|
| 161 |
+
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
|
| 162 |
+
drwav_uninit(&wav);
|
| 163 |
|
| 164 |
+
// convert to mono, float
|
| 165 |
+
pcmf32.resize(n);
|
| 166 |
+
if (wav.channels == 1) {
|
| 167 |
+
for (int i = 0; i < n; i++) {
|
| 168 |
+
pcmf32[i] = float(pcm16[i])/32768.0f;
|
| 169 |
+
}
|
| 170 |
+
} else {
|
| 171 |
+
for (int i = 0; i < n; i++) {
|
| 172 |
+
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
|
| 173 |
+
}
|
| 174 |
}
|
| 175 |
}
|
|
|
|
| 176 |
|
| 177 |
+
// print some info about the processing
|
| 178 |
+
{
|
| 179 |
+
printf("\n");
|
| 180 |
+
if (!whisper_is_multilingual(ctx)) {
|
| 181 |
+
if (params.language != "en" || params.translate) {
|
| 182 |
+
params.language = "en";
|
| 183 |
+
params.translate = false;
|
| 184 |
+
printf("%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
| 185 |
+
}
|
| 186 |
}
|
| 187 |
+
printf("%s: processing '%s' (%d samples, %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
|
| 188 |
+
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, params.n_threads,
|
| 189 |
+
params.language.c_str(),
|
| 190 |
+
params.translate ? "translate" : "transcribe",
|
| 191 |
+
params.no_timestamps ? 0 : 1);
|
| 192 |
+
printf("\n");
|
| 193 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
+
// run the inference
|
| 196 |
+
{
|
| 197 |
+
whisper_full_params wparams = whisper_full_default_params(WHISPER_DECODE_GREEDY);
|
| 198 |
+
|
| 199 |
+
wparams.print_realtime = true;
|
| 200 |
+
wparams.print_progress = false;
|
| 201 |
+
wparams.print_timestamps = !params.no_timestamps;
|
| 202 |
+
wparams.print_special_tokens = params.print_special_tokens;
|
| 203 |
+
wparams.translate = params.translate;
|
| 204 |
+
wparams.language = params.language.c_str();
|
| 205 |
+
wparams.n_threads = params.n_threads;
|
| 206 |
+
|
| 207 |
+
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
| 208 |
+
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
| 209 |
+
return 6;
|
| 210 |
+
}
|
| 211 |
|
| 212 |
+
// print result;
|
| 213 |
+
if (!wparams.print_realtime) {
|
| 214 |
+
printf("\n");
|
| 215 |
|
| 216 |
+
const int n_segments = whisper_full_n_segments(ctx);
|
| 217 |
+
for (int i = 0; i < n_segments; ++i) {
|
| 218 |
+
const char * text = whisper_full_get_segment_text(ctx, i);
|
| 219 |
|
| 220 |
+
if (params.no_timestamps) {
|
| 221 |
+
printf ("%s", text);
|
| 222 |
+
fflush(stdout);
|
| 223 |
+
} else {
|
| 224 |
+
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
| 225 |
+
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 226 |
|
| 227 |
+
printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
|
| 228 |
+
}
|
| 229 |
}
|
| 230 |
}
|
| 231 |
}
|