ggerganov commited on
Commit
2ee248a
·
unverified ·
1 Parent(s): 2512003

command.wasm : add voice assistant example for the Web (#171)

Browse files

Same as the command-line tool "command", but runs in the browser

Also, added helper script "extra/deploy-wasm.sh" and fixed some timing
constants for the WASM examples.

README.md CHANGED
@@ -34,7 +34,7 @@ As an example, here is a video of running the model on an iPhone 13 device - ful
34
 
35
  https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4
36
 
37
- You can also easily make your own offline voice assistant application:
38
 
39
  https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
40
 
 
34
 
35
  https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4
36
 
37
+ You can also easily make your own offline voice assistant application: [command](examples/command)
38
 
39
  https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
40
 
examples/CMakeLists.txt CHANGED
@@ -21,6 +21,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
21
  if (EMSCRIPTEN)
22
  add_subdirectory(whisper.wasm)
23
  add_subdirectory(stream.wasm)
 
24
  add_subdirectory(talk.wasm)
25
  else()
26
  add_subdirectory(main)
 
21
  if (EMSCRIPTEN)
22
  add_subdirectory(whisper.wasm)
23
  add_subdirectory(stream.wasm)
24
+ add_subdirectory(command.wasm)
25
  add_subdirectory(talk.wasm)
26
  else()
27
  add_subdirectory(main)
examples/command.wasm/CMakeLists.txt ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#
# libcommand - the WASM module built from emscripten.cpp
#

set(TARGET libcommand)

add_executable(${TARGET}
    emscripten.cpp
    )

target_link_libraries(${TARGET} PRIVATE
    whisper
    )

unset(EXTRA_FLAGS)

if (WHISPER_WASM_SINGLE_FILE)
    # embed the .wasm binary directly inside the generated JS file
    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
    message(STATUS "Embedding WASM inside command.js")

    add_custom_command(
        TARGET ${TARGET} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy
        ${CMAKE_BINARY_DIR}/bin/libcommand.js
        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/command.wasm/command.js
        )
endif()

# Emscripten link flags: embind, pthreads pool, preallocated memory and
# filesystem support (the model file is stored in the in-memory FS)
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
    -s PTHREAD_POOL_SIZE=8 \
    -s INITIAL_MEMORY=1024MB \
    -s TOTAL_MEMORY=1024MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
    ")

#
# command.wasm - the HTML page and helpers deployed next to the module
#

set(TARGET command.wasm)

configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js   ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
examples/command.wasm/README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # command.wasm
2
+
3
+ This is a basic Voice Assistant example that accepts voice commands from the microphone.
4
+ It runs in fully in the browser via WebAseembly.
5
+
6
+ Online demo: https://whisper.ggerganov.com/command/
7
+
8
+ Terminal version: https://github.com/ggerganov/whisper.cpp/examples/command
9
+
10
+ ## Build instructions
11
+
12
+ ```bash
13
+ # build using Emscripten (v3.1.2)
14
+ git clone https://github.com/ggerganov/whisper.cpp
15
+ cd whisper.cpp
16
+ mkdir build-em && cd build-em
17
+ emcmake cmake ..
18
+ make -j
19
+
20
+ # copy the produced page to your HTTP path
21
+ cp bin/command.wasm/* /path/to/html/
22
+ cp bin/libcommand.worker.js /path/to/html/
23
+ ```
examples/command.wasm/emscripten.cpp ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "ggml.h"
2
+ #include "whisper.h"
3
+
4
+ #include <emscripten.h>
5
+ #include <emscripten/bind.h>
6
+
7
+ #include <atomic>
8
+ #include <cmath>
9
+ #include <mutex>
10
+ #include <string>
11
+ #include <thread>
12
+ #include <vector>
13
+ #include <regex>
14
+
15
constexpr int N_THREAD = 8;

// pool of whisper contexts - the JS "init" binding allocates a free slot and
// returns (index + 1) as the instance handle
std::vector<struct whisper_context *> g_contexts(4, nullptr);

// protects g_status, g_status_forced, g_transcribed and g_pcmf32
std::mutex g_mutex;
// background thread running command_main()
std::thread g_worker;

// cleared by the "free" binding to stop the worker loop
std::atomic<bool> g_running(false);

std::string g_status        = ""; // status reported by the worker thread
std::string g_status_forced = ""; // status forced from JS via "set_status" (takes precedence)
std::string g_transcribed   = ""; // last recognized command, consumed by "get_transcribed"

// most recent audio samples, pushed from JS via "set_audio"
std::vector<float> g_pcmf32;
29
+
30
// Strip leading and trailing whitespace from a string.
static std::string trim(const std::string & s) {
    static const std::regex ws_edges("^\\s+|\\s+$");

    return std::regex_replace(s, ws_edges, "");
}
34
+
35
// Apply a simple one-pole high-pass (DC-removal) filter in place.
//
//   data        - audio samples, modified in place
//   cutoff      - cutoff frequency in Hz
//   sample_rate - sampling rate in Hz
//
// Uses the standard RC high-pass recurrence y[i] = alpha*(y[i-1] + x[i] - x[i-1]).
static void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.empty()) {
        // nothing to filter - avoids reading data[0] on an empty buffer
        return;
    }

    const float rc    = 1.0f / (2.0f * M_PI * cutoff);
    const float dt    = 1.0f / sample_rate;
    const float alpha = dt / (rc + dt);

    float y = data[0];

    for (size_t i = 1; i < data.size(); i++) {
        y = alpha * (y + data[i] - data[i - 1]);
        data[i] = y;
    }
}
47
+
48
// Compute a similarity score in [0, 1] between two strings based on the
// Levenshtein edit distance (1.0 means identical).
static float similarity(const std::string & s0, const std::string & s1) {
    const size_t len0 = s0.size() + 1;
    const size_t len1 = s1.size() + 1;

    // two empty strings are a perfect match - also avoids dividing by
    // max(0, 0) below
    if (s0.empty() && s1.empty()) {
        return 1.0f;
    }

    std::vector<int> col(len1, 0);
    std::vector<int> prevCol(len1, 0);

    for (size_t i = 0; i < len1; i++) {
        prevCol[i] = i;
    }

    // start at i = 1: the i = 0 row is the initialization above; iterating
    // from 0 would read s0[i - 1] == s0[-1] (out of bounds)
    for (size_t i = 1; i < len0; i++) {
        col[0] = i;
        for (size_t j = 1; j < len1; j++) {
            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
        }
        col.swap(prevCol);
    }

    const float dist = prevCol[len1 - 1];

    return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
72
+
73
+ void command_set_status(const std::string & status) {
74
+ std::lock_guard<std::mutex> lock(g_mutex);
75
+ g_status = status;
76
+ }
77
+
78
+ bool command_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
79
+ const int n_samples = pcmf32.size();
80
+ const int n_samples_last = (sample_rate * last_ms) / 1000;
81
+
82
+ if (n_samples_last >= n_samples) {
83
+ // not enough samples - assume no speech
84
+ return false;
85
+ }
86
+
87
+ if (freq_thold > 0.0f) {
88
+ high_pass_filter(pcmf32, freq_thold, sample_rate);
89
+ }
90
+
91
+ float energy_all = 0.0f;
92
+ float energy_last = 0.0f;
93
+
94
+ for (size_t i = 0; i < n_samples; i++) {
95
+ energy_all += fabsf(pcmf32[i]);
96
+
97
+ if (i >= n_samples - n_samples_last) {
98
+ energy_last += fabsf(pcmf32[i]);
99
+ }
100
+ }
101
+
102
+ energy_all /= n_samples;
103
+ energy_last /= n_samples_last;
104
+
105
+ if (verbose) {
106
+ fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
107
+ }
108
+
109
+ if (energy_last > vad_thold*energy_all) {
110
+ return false;
111
+ }
112
+
113
+ return true;
114
+ }
115
+
116
+ std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
117
+ const auto t_start = std::chrono::high_resolution_clock::now();
118
+
119
+ prob = 0.0f;
120
+ t_ms = 0;
121
+
122
+ if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
123
+ return "";
124
+ }
125
+
126
+ int prob_n = 0;
127
+ std::string result;
128
+
129
+ const int n_segments = whisper_full_n_segments(ctx);
130
+ for (int i = 0; i < n_segments; ++i) {
131
+ const char * text = whisper_full_get_segment_text(ctx, i);
132
+
133
+ result += text;
134
+
135
+ const int n_tokens = whisper_full_n_tokens(ctx, i);
136
+ for (int j = 0; j < n_tokens; ++j) {
137
+ const auto token = whisper_full_get_token_data(ctx, i, j);
138
+
139
+ prob += token.p;
140
+ ++prob_n;
141
+ }
142
+ }
143
+
144
+ if (prob_n > 0) {
145
+ prob /= prob_n;
146
+ }
147
+
148
+ const auto t_end = std::chrono::high_resolution_clock::now();
149
+ t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
150
+
151
+ return result;
152
+ }
153
+
154
+ void command_get_audio(int ms, int sample_rate, std::vector<float> & audio) {
155
+ const int64_t n_samples = (ms * sample_rate) / 1000;
156
+
157
+ int64_t n_take = 0;
158
+ if (g_pcmf32.size() < n_samples) {
159
+ n_take = g_pcmf32.size();
160
+ } else {
161
+ n_take = n_samples;
162
+ }
163
+
164
+ audio.resize(n_take);
165
+ std::copy(g_pcmf32.end() - n_take, g_pcmf32.end(), audio.begin());
166
+ }
167
+
168
// Worker-thread entry point: the main voice-assistant loop.
//
// First waits for the user to say the activation phrase (k_prompt); once it
// has been recognized, repeatedly listens for voice commands, transcribes
// them and publishes the result via g_transcribed / g_status.
//
//   index - slot in g_contexts holding the whisper context to use
//
// Runs until g_running is cleared (by the "free" binding), then frees the
// whisper context.
void command_main(size_t index) {
    command_set_status("loading data ...");

    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);

    wparams.n_threads        = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
    wparams.offset_ms        = 0;
    wparams.translate        = false;
    wparams.no_context       = true;
    wparams.single_segment   = true;
    wparams.print_realtime   = false;
    wparams.print_progress   = false;
    wparams.print_timestamps = true;
    wparams.print_special    = false;

    wparams.max_tokens       = 32;
    wparams.audio_ctx        = 768; // partial encoder context for better performance

    wparams.language         = "en";

    printf("command: using %d threads\n", wparams.n_threads);

    bool is_running   = true; // NOTE(review): unused in this function - presumably kept for parity with examples/command; confirm
    bool have_prompt  = false;
    bool ask_prompt   = true;
    bool print_energy = false;

    float prob0 = 0.0f; // average token probability of the activation phrase
    float prob  = 0.0f; // average token probability of the current command

    std::vector<float> pcmf32_cur;
    std::vector<float> pcmf32_prompt;

    const std::string k_prompt = "Ok Whisper, start listening for commands.";

    // whisper context
    auto & ctx = g_contexts[index];

    // audio window sizes for VAD, activation-phrase and command capture
    const int32_t vad_ms     = 2000;
    const int32_t prompt_ms  = 5000;
    const int32_t command_ms = 4000;

    const float vad_thold  = 0.1f;
    const float freq_thold = -1.0f; // <= 0 disables the high-pass filter in the VAD

    while (g_running) {
        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));

        if (ask_prompt) {
            fprintf(stdout, "\n");
            fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
            fprintf(stdout, "\n");

            {
                char txt[1024];
                snprintf(txt, sizeof(txt), "Say the following phrase: '%s'", k_prompt.c_str());
                command_set_status(txt);
            }

            ask_prompt = false;
        }

        int64_t t_ms = 0;

        {
            command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);

            if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                command_set_status("Speech detected! Processing ...");

                if (!have_prompt) {
                    // still waiting for the activation phrase
                    command_get_audio(prompt_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);

                    const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob0, t_ms));

                    fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);

                    const float sim = similarity(txt, k_prompt);

                    // accept only if both the length and the edit-distance
                    // similarity are close to the expected phrase
                    if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
                        fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
                        ask_prompt = true;
                    } else {
                        fprintf(stdout, "\n");
                        fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
                        fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
                        fprintf(stdout, "\n");

                        {
                            char txt[1024];
                            snprintf(txt, sizeof(txt), "Success! Waiting for voice commands ...");
                            command_set_status(txt);
                        }

                        // save the audio for the prompt
                        pcmf32_prompt = pcmf32_cur;
                        have_prompt = true;
                    }
                } else {
                    command_get_audio(command_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);

                    // prepend the prompt audio
                    pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());

                    const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob, t_ms));

                    prob = 100.0f*(prob - prob0);

                    fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());

                    // find the prompt in the text
                    float best_sim = 0.0f;
                    size_t best_len = 0;
                    for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
                        const auto prompt = txt.substr(0, n);

                        const float sim = similarity(prompt, k_prompt);

                        //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);

                        if (sim > best_sim) {
                            best_sim = sim;
                            best_len = n;
                        }
                    }

                    // everything after the best prompt match is the actual command
                    const std::string command = ::trim(txt.substr(best_len));

                    fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
                    fprintf(stdout, "\n");

                    {
                        char txt[1024];
                        snprintf(txt, sizeof(txt), "Command '%s', (t = %d ms)", command.c_str(), (int) t_ms);
                        command_set_status(txt);
                    }
                    {
                        std::lock_guard<std::mutex> lock(g_mutex);
                        g_transcribed = command;
                    }
                }

                // discard the audio that has just been processed
                g_pcmf32.clear();
            }
        }
    }

    if (index < g_contexts.size()) {
        whisper_free(g_contexts[index]);
        g_contexts[index] = nullptr;
    }
}
322
+
323
EMSCRIPTEN_BINDINGS(command) {
    // init(path_model) -> instance handle (slot index + 1), or 0 on failure.
    // Loads the model from the in-memory FS and starts the worker thread.
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
                g_contexts[i] = whisper_init(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    g_running = true;
                    // join a previously finished worker before starting a new one
                    if (g_worker.joinable()) {
                        g_worker.join();
                    }
                    g_worker = std::thread([i]() {
                        command_main(i);
                    });

                    return i + 1;
                } else {
                    return (size_t) 0;
                }
            }
        }

        return (size_t) 0;
    }));

    // free(index): signal the worker loop to stop; the worker itself frees
    // the whisper context at the end of command_main()
    emscripten::function("free", emscripten::optional_override([](size_t index) {
        if (g_running) {
            g_running = false;
        }
    }));

    // set_audio(index, audio): copy the JS Float32Array "audio" into
    // g_pcmf32. "index" is the 1-based handle returned by init().
    // Returns 0 on success, -1 for a bad index, -2 if the slot has no context.
    emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
        --index;

        if (index >= g_contexts.size()) {
            return -1;
        }

        if (g_contexts[index] == nullptr) {
            return -2;
        }

        {
            std::lock_guard<std::mutex> lock(g_mutex);
            const int n = audio["length"].as<int>();

            emscripten::val heap = emscripten::val::module_property("HEAPU8");
            emscripten::val memory = heap["buffer"];

            g_pcmf32.resize(n);

            // construct a typed-array view over the WASM heap at g_pcmf32's
            // storage and bulk-copy the JS audio data into it
            emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
            memoryView.call<void>("set", audio);
        }

        return 0;
    }));

    // get_transcribed() -> the last recognized command; the stored value is
    // consumed (moved out), so each command is returned at most once
    emscripten::function("get_transcribed", emscripten::optional_override([]() {
        std::string transcribed;

        {
            std::lock_guard<std::mutex> lock(g_mutex);
            transcribed = std::move(g_transcribed);
        }

        return transcribed;
    }));

    // get_status() -> the current status string; a status forced from JS via
    // set_status() takes precedence over the worker's own status
    emscripten::function("get_status", emscripten::optional_override([]() {
        std::string status;

        {
            std::lock_guard<std::mutex> lock(g_mutex);
            status = g_status_forced.empty() ? g_status : g_status_forced;
        }

        return status;
    }));

    // set_status(status): force the status shown to the UI (pass "" to go
    // back to the worker-reported status)
    emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
        {
            std::lock_guard<std::mutex> lock(g_mutex);
            g_status_forced = status;
        }
    }));
}
examples/command.wasm/index-tmpl.html ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-us">
3
+ <head>
4
+ <title>command : Voice assistant example using Whisper + WebAssembly</title>
5
+
6
+ <style>
7
+ #output {
8
+ width: 100%;
9
+ height: 100%;
10
+ margin: 0 auto;
11
+ margin-top: 10px;
12
+ border-left: 0px;
13
+ border-right: 0px;
14
+ padding-left: 0px;
15
+ padding-right: 0px;
16
+ display: block;
17
+ background-color: black;
18
+ color: white;
19
+ font-size: 10px;
20
+ font-family: 'Lucida Console', Monaco, monospace;
21
+ outline: none;
22
+ white-space: pre;
23
+ overflow-wrap: normal;
24
+ overflow-x: scroll;
25
+ }
26
+ </style>
27
+ </head>
28
+ <body>
29
+ <div id="main-container">
30
+ <b>command : Voice assistant example using Whisper + WebAssembly</b>
31
+
32
+ <br><br>
33
+
34
+ You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/command.wasm">GitHub</a>.
35
+
36
+ <br><br>
37
+
38
+ <hr>
39
+
40
+ Select the model you would like to use, click the "Start" button and follow the instructions.
41
+
42
+ <br><br>
43
+
44
+ <div id="model-whisper">
45
+ Whisper model: <span id="model-whisper-status"></span>
46
+ <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
47
+ <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
48
+ <span id="fetch-whisper-progress"></span>
49
+
50
+ <!--
51
+ <input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
52
+ -->
53
+ </div>
54
+
55
+ <br>
56
+
57
+ <div id="input">
58
+ <button id="start" onclick="onStart()" disabled>Start</button>
59
+ <button id="stop" onclick="onStop()" disabled>Stop</button>
60
+ <button id="clear" onclick="clearCache()">Clear Cache</button>
61
+ </div>
62
+
63
+ <br>
64
+
65
+ <div id="state">
66
+ Status: <b><span id="state-status">not started</span></b>
67
+
68
+ <pre id="state-transcribed">[The recognized voice commands will be displayed here]</pre>
69
+ </div>
70
+
71
+ <hr>
72
+
73
+ Debug output:
74
+ <textarea id="output" rows="20"></textarea>
75
+
76
+ <br>
77
+
78
+ <b>Troubleshooting</b>
79
+
80
+ <br><br>
81
+
82
+ The page does some heavy computations, so make sure:
83
+
84
+ <ul>
85
+ <li>To use a modern web browser (e.g. Chrome, Firefox)</li>
86
+ <li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
87
+ <li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
88
+ </ul>
89
+
90
+ <div class="cell-version">
91
+ <span>
92
+ |
93
+ Build time: <span class="nav-link">@GIT_DATE@</span> |
94
+ Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
95
+ Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
96
+ <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/command.wasm">Source Code</a> |
97
+ </span>
98
+ </div>
99
+ </div>
100
+
101
+ <script type="text/javascript" src="helpers.js"></script>
102
+ <script type='text/javascript'>
103
// web audio context
var context = null;

// audio data
var audio = null;   // samples captured since the last (re)start of recording
var audio0 = null;  // samples captured before the last restart (concatenated on upload)

// the command instance
var instance = null;

// model name
var model_whisper = null;

// Emscripten module configuration: route stdout/stderr to the debug
// textarea (printTextarea comes from helpers.js)
var Module = {
    print: printTextarea,
    printErr: printTextarea,
    setStatus: function(text) {
        printTextarea('js: ' + text);
    },
    monitorRunDependencies: function(left) {
    },
    preRun: function() {
        printTextarea('js: Preparing ...');
    },
    postRun: function() {
        printTextarea('js: Initialized successfully!');
    }
};

//
// fetch models
//

// IndexedDB settings used by loadRemote() to cache downloaded models
let dbVersion = 1
let dbName    = 'whisper.ggerganov.com';
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
139
+
140
// Store a fetched model file into the Emscripten in-memory filesystem so the
// C++ side can open it by name, then enable the Start button.
function storeFS(fname, buf) {
    // write to WASM file using FS_createDataFile
    // if the file exists, delete it
    try {
        Module.FS_unlink(fname);
    } catch (e) {
        // ignore
    }

    Module.FS_createDataFile("/", fname, buf, true, true);

    printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);

    document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';

    if (model_whisper != null) {
        document.getElementById('start').disabled = false;
        document.getElementById('stop' ).disabled = true;
    }
}
160
+
161
// Download the selected Whisper model (loadRemote from helpers.js -
// presumably with IndexedDB caching, see the db* constants above) and store
// it in the WASM filesystem as "whisper.bin" via storeFS().
function loadWhisper(model) {
    let urls = {
        'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
        'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
    };

    // download sizes in MB, passed to the progress helper
    let sizes = {
        'tiny.en': 75,
        'base.en': 142,
    };

    let url     = urls[model];
    let dst     = 'whisper.bin';
    let size_mb = sizes[model];

    model_whisper = model;

    // hide the model buttons while the download is in progress
    document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
    document.getElementById('fetch-whisper-base-en').style.display = 'none';
    document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

    cbProgress = function(p) {
        let el = document.getElementById('fetch-whisper-progress');
        el.innerHTML = Math.round(100*p) + '%';
    };

    // restore the buttons if the user cancels the download
    cbCancel = function() {
        var el;
        el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
        el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
        el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
    };

    loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
196
+
197
+ //
198
+ // microphone
199
+ //
200
+
201
const kSampleRate = 16000;          // sampling rate expected by the C++ side
const kRestartRecording_s = 120;    // restart the MediaRecorder after this many seconds of audio
const kIntervalAudio_ms = 250;      // pass the recorded audio to the C++ instance at this rate

var mediaRecorder = null;
var doRecording = false;
var startTime = 0;

// prefixed fallbacks for older WebKit browsers
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
211
+
212
// Stop capturing: tell the C++ side we are paused and drop all audio state.
// The polling interval inside startRecording() notices doRecording == false
// and shuts down the MediaRecorder.
function stopRecording() {
    Module.set_status("paused");
    doRecording = false;
    audio0 = null;
    audio = null;
    context = null;
}
219
+
220
// Start (or restart) capturing microphone audio with a MediaRecorder,
// decode the recorded chunks to f32 PCM and push the accumulated samples to
// the C++ side via Module.set_audio().
function startRecording() {
    if (!context) {
        context = new AudioContext({
            sampleRate: kSampleRate,
            channelCount: 1,
            echoCancellation: false,
            autoGainControl: true,
            noiseSuppression: true,
        });
    }

    Module.set_status("");

    document.getElementById('start').disabled = true;
    document.getElementById('stop').disabled = false;

    doRecording = true;
    startTime = Date.now();

    var chunks = [];  // encoded chunks accumulated since recording started
    var stream = null;

    navigator.mediaDevices.getUserMedia({audio: true, video: false})
        .then(function(s) {
            stream = s;
            mediaRecorder = new MediaRecorder(stream);
            mediaRecorder.ondataavailable = function(e) {
                chunks.push(e.data);

                // re-decode everything recorded so far to raw samples
                var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
                var reader = new FileReader();

                reader.onload = function(event) {
                    var buf = new Uint8Array(reader.result);

                    if (!context) {
                        // recording was stopped in the meantime
                        return;
                    }
                    context.decodeAudioData(buf.buffer, function(audioBuffer) {
                        // render through an OfflineAudioContext to get the PCM data
                        var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
                        var source = offlineContext.createBufferSource();
                        source.buffer = audioBuffer;
                        source.connect(offlineContext.destination);
                        source.start(0);

                        offlineContext.startRendering().then(function(renderedBuffer) {
                            audio = renderedBuffer.getChannelData(0);

                            //printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));

                            // concatenate pre-restart audio (audio0) with the current audio
                            var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
                            if (audio0 != null) {
                                audioAll.set(audio0, 0);
                            }
                            audioAll.set(audio, audio0 == null ? 0 : audio0.length);

                            if (instance) {
                                Module.set_audio(instance, audioAll);
                            }
                        });
                    }, function(e) {
                        audio = null;
                    });
                }

                reader.readAsArrayBuffer(blob);
            };

            mediaRecorder.onstop = function(e) {
                // restart automatically unless the user pressed Stop
                if (doRecording) {
                    setTimeout(function() {
                        startRecording();
                    });
                }
            };

            mediaRecorder.start(kIntervalAudio_ms);
        })
        .catch(function(err) {
            printTextarea('js: error getting audio stream: ' + err);
        });

    var interval = setInterval(function() {
        if (!doRecording) {
            // user pressed Stop - tear everything down
            clearInterval(interval);
            mediaRecorder.stop();
            stream.getTracks().forEach(function(track) {
                track.stop();
            });

            document.getElementById('start').disabled = false;
            document.getElementById('stop').disabled = true;

            mediaRecorder = null;
        }

        // if audio length is more than kRestartRecording_s seconds, restart recording
        if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
            if (doRecording) {
                //printTextarea('js: restarting recording');

                clearInterval(interval);
                audio0 = audio;
                audio = null;
                mediaRecorder.stop();
                stream.getTracks().forEach(function(track) {
                    track.stop();
                });
            }
        }
    }, 100);
}
332
+
333
+ //
334
+ // main
335
+ //
336
+
337
var nLines = 0;             // number of lines currently shown in the transcript
var intervalUpdate = null;  // UI refresh timer started by onStart()
var transcribedAll = '';    // accumulated transcript HTML
340
+
341
// "Start" button handler: initialize the whisper instance on first use,
// begin recording and start polling the C++ side for status/commands.
function onStart() {
    if (!instance) {
        instance = Module.init('whisper.bin');

        if (instance) {
            printTextarea("js: whisper initialized, instance: " + instance);
        }
    }

    if (!instance) {
        printTextarea("js: failed to initialize whisper");
        return;
    }

    startRecording();

    intervalUpdate = setInterval(function() {
        var transcribed = Module.get_transcribed();

        if (transcribed != null && transcribed.length > 1) {
            transcribedAll += transcribed + '<br>';
            nLines++;

            // if more than 10 lines, remove the first line
            if (nLines > 10) {
                var i = transcribedAll.indexOf('<br>');
                if (i > 0) {
                    transcribedAll = transcribedAll.substring(i + 4);
                    nLines--;
                }
            }
        }

        document.getElementById('state-status').innerHTML = Module.get_status();
        document.getElementById('state-transcribed').innerHTML = transcribedAll;
    }, 100);
}
378
+
379
// "Stop" button handler.
function onStop() {
    stopRecording();
}
382
+
383
+ </script>
384
+ <script type="text/javascript" src="command.js"></script>
385
+ </body>
386
+ </html>
examples/command/README.md CHANGED
@@ -13,6 +13,8 @@ More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/
13
 
14
  https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
15
 
 
 
16
  ## Building
17
 
18
  The `command` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
 
13
 
14
  https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
15
 
16
+ Web version: https://github.com/ggerganov/whisper.cpp/tree/master/examples/command.wasm
17
+
18
  ## Building
19
 
20
  The `command` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
examples/command/command.cpp CHANGED
@@ -535,7 +535,7 @@ int main(int argc, char ** argv) {
535
 
536
  bool is_running = true;
537
  bool have_prompt = false;
538
- bool ask_prompt = true;
539
 
540
  float prob0 = 0.0f;
541
  float prob = 0.0f;
 
535
 
536
  bool is_running = true;
537
  bool have_prompt = false;
538
+ bool ask_prompt = true;
539
 
540
  float prob0 = 0.0f;
541
  float prob = 0.0f;
examples/stream.wasm/index-tmpl.html CHANGED
@@ -100,12 +100,6 @@
100
 
101
  <script type="text/javascript" src="helpers.js"></script>
102
  <script type='text/javascript'>
103
- const kRestartRecording_s = 15;
104
- const kSampleRate = 16000;
105
-
106
- window.AudioContext = window.AudioContext || window.webkitAudioContext;
107
- window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
108
-
109
  // web audio context
110
  var context = null;
111
 
@@ -204,10 +198,17 @@
204
  // microphone
205
  //
206
 
 
 
 
 
207
  var mediaRecorder = null;
208
  var doRecording = false;
209
  var startTime = 0;
210
 
 
 
 
211
  function stopRecording() {
212
  Module.set_status("paused");
213
  doRecording = false;
@@ -219,7 +220,7 @@
219
  function startRecording() {
220
  if (!context) {
221
  context = new AudioContext({
222
- sampleRate: 16000,
223
  channelCount: 1,
224
  echoCancellation: false,
225
  autoGainControl: true,
@@ -292,7 +293,7 @@
292
  }
293
  };
294
 
295
- mediaRecorder.start(5000);
296
  })
297
  .catch(function(err) {
298
  printTextarea('js: error getting audio stream: ' + err);
@@ -326,7 +327,7 @@
326
  });
327
  }
328
  }
329
- }, 250);
330
  }
331
 
332
  //
 
100
 
101
  <script type="text/javascript" src="helpers.js"></script>
102
  <script type='text/javascript'>
 
 
 
 
 
 
103
  // web audio context
104
  var context = null;
105
 
 
198
  // microphone
199
  //
200
 
201
+ const kSampleRate = 16000;
202
+ const kRestartRecording_s = 120;
203
+ const kIntervalAudio_ms = 5000; // pass the recorded audio to the C++ instance at this rate
204
+
205
  var mediaRecorder = null;
206
  var doRecording = false;
207
  var startTime = 0;
208
 
209
+ window.AudioContext = window.AudioContext || window.webkitAudioContext;
210
+ window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
211
+
212
  function stopRecording() {
213
  Module.set_status("paused");
214
  doRecording = false;
 
220
  function startRecording() {
221
  if (!context) {
222
  context = new AudioContext({
223
+ sampleRate: kSampleRate,
224
  channelCount: 1,
225
  echoCancellation: false,
226
  autoGainControl: true,
 
293
  }
294
  };
295
 
296
+ mediaRecorder.start(kIntervalAudio_ms);
297
  })
298
  .catch(function(err) {
299
  printTextarea('js: error getting audio stream: ' + err);
 
327
  });
328
  }
329
  }
330
+ }, 100);
331
  }
332
 
333
  //
examples/talk.wasm/index-tmpl.html CHANGED
@@ -160,12 +160,6 @@
160
 
161
  <script type="text/javascript" src="helpers.js"></script>
162
  <script type='text/javascript'>
163
- const kRestartRecording_s = 15;
164
- const kSampleRate = 16000;
165
-
166
- window.AudioContext = window.AudioContext || window.webkitAudioContext;
167
- window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
168
-
169
  // web audio context
170
  var context = null;
171
 
@@ -342,10 +336,17 @@
342
  // microphone
343
  //
344
 
 
 
 
 
345
  var mediaRecorder = null;
346
  var doRecording = false;
347
  var startTime = 0;
348
 
 
 
 
349
  function stopRecording() {
350
  Module.set_status("paused");
351
  doRecording = false;
@@ -357,7 +358,7 @@
357
  function startRecording() {
358
  if (!context) {
359
  context = new AudioContext({
360
- sampleRate: 16000,
361
  channelCount: 1,
362
  echoCancellation: false,
363
  autoGainControl: true,
@@ -431,7 +432,7 @@
431
  }
432
  };
433
 
434
- mediaRecorder.start(250);
435
  })
436
  .catch(function(err) {
437
  printTextarea('js: error getting audio stream: ' + err);
@@ -466,7 +467,7 @@
466
  });
467
  }
468
  }
469
- }, 250);
470
  }
471
 
472
  //
 
160
 
161
  <script type="text/javascript" src="helpers.js"></script>
162
  <script type='text/javascript'>
 
 
 
 
 
 
163
  // web audio context
164
  var context = null;
165
 
 
336
  // microphone
337
  //
338
 
339
+ const kSampleRate = 16000;
340
+ const kRestartRecording_s = 120;
341
+ const kIntervalAudio_ms = 250; // pass the recorded audio to the C++ instance at this rate
342
+
343
  var mediaRecorder = null;
344
  var doRecording = false;
345
  var startTime = 0;
346
 
347
+ window.AudioContext = window.AudioContext || window.webkitAudioContext;
348
+ window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
349
+
350
  function stopRecording() {
351
  Module.set_status("paused");
352
  doRecording = false;
 
358
  function startRecording() {
359
  if (!context) {
360
  context = new AudioContext({
361
+ sampleRate: kSampleRate,
362
  channelCount: 1,
363
  echoCancellation: false,
364
  autoGainControl: true,
 
432
  }
433
  };
434
 
435
+ mediaRecorder.start(kIntervalAudio_ms);
436
  })
437
  .catch(function(err) {
438
  printTextarea('js: error getting audio stream: ' + err);
 
467
  });
468
  }
469
  }
470
+ }, 100);
471
  }
472
 
473
  //
examples/whisper.wasm/index-tmpl.html CHANGED
@@ -225,12 +225,6 @@
225
  }
226
  };
227
 
228
- const kMaxAudio_s = 120;
229
- const kSampleRate = 16000;
230
-
231
- window.AudioContext = window.AudioContext || window.webkitAudioContext;
232
- window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
233
-
234
  // web audio context
235
  var context = null;
236
 
@@ -348,9 +342,21 @@
348
  // audio file
349
  //
350
 
 
 
 
 
 
 
351
  function loadAudio(event) {
352
  if (!context) {
353
- context = new AudioContext({sampleRate: 16000});
 
 
 
 
 
 
354
  }
355
 
356
  var file = event.target.files[0] || null;
@@ -410,7 +416,13 @@
410
  // update progress information
411
  function startRecording() {
412
  if (!context) {
413
- context = new AudioContext({sampleRate: 16000});
 
 
 
 
 
 
414
  }
415
 
416
  document.getElementById('start').disabled = true;
 
225
  }
226
  };
227
 
 
 
 
 
 
 
228
  // web audio context
229
  var context = null;
230
 
 
342
  // audio file
343
  //
344
 
345
+ const kMaxAudio_s = 120;
346
+ const kSampleRate = 16000;
347
+
348
+ window.AudioContext = window.AudioContext || window.webkitAudioContext;
349
+ window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
350
+
351
  function loadAudio(event) {
352
  if (!context) {
353
+ context = new AudioContext({
354
+ sampleRate: kSampleRate,
355
+ channelCount: 1,
356
+ echoCancellation: false,
357
+ autoGainControl: true,
358
+ noiseSuppression: true,
359
+ });
360
  }
361
 
362
  var file = event.target.files[0] || null;
 
416
  // update progress information
417
  function startRecording() {
418
  if (!context) {
419
+ context = new AudioContext({
420
+ sampleRate: kSampleRate,
421
+ channelCount: 1,
422
+ echoCancellation: false,
423
+ autoGainControl: true,
424
+ noiseSuppression: true,
425
+ });
426
  }
427
 
428
  document.getElementById('start').disabled = true;
extra/deploy-wasm.sh ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #
3
+ # This is a helper script to deploy all WebAssembly examples to my node
4
+ # Run from the build directory:
5
+ #
6
+ # cd build-em
7
+ # ../extra/deploy-wasm.sh
8
+ #
9
+
10
+ # check if emcmake is available
11
+ if ! command -v emcmake &> /dev/null
12
+ then
13
+ echo "Error: emscripten environment is not set up"
14
+ exit
15
+ fi
16
+
17
+ emcmake cmake .. && make -j
18
+ if [ $? -ne 0 ]; then
19
+ echo "Error: build failed"
20
+ exit
21
+ fi
22
+
23
+ # copy all wasm files to the node
24
+ scp bin/whisper.wasm/* root@linode0:/var/www/html/whisper/ && scp bin/libwhisper.worker.js root@linode0:/var/www/html/whisper/
25
+ scp bin/stream.wasm/* root@linode0:/var/www/html/whisper/stream/ && scp bin/libstream.worker.js root@linode0:/var/www/html/whisper/stream/
26
+ scp bin/command.wasm/* root@linode0:/var/www/html/whisper/command/ && scp bin/libcommand.worker.js root@linode0:/var/www/html/whisper/command/
27
+ scp bin/talk.wasm/* root@linode0:/var/www/html/whisper/talk/ && scp bin/libtalk.worker.js root@linode0:/var/www/html/whisper/talk/
28
+
29
+ echo "Done"
30
+ exit