ggerganov committed on
Commit
acbd6f7
·
unverified ·
1 Parent(s): 560e81f

examples : refactor in order to reuse code and reduce duplication (#482)

Browse files

* examples : refactor common code into a library

* examples : refactor common SDL code into a library

* make : update Makefile to use common libs

* common : fix MSVC M_PI ..

* addon.node : link common lib

Makefile CHANGED
@@ -197,18 +197,21 @@ clean:
197
 
198
  CC_SDL=`sdl2-config --cflags --libs`
199
 
200
- main: examples/main/main.cpp ggml.o whisper.o
201
- $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o whisper.o -o main $(LDFLAGS)
 
 
 
202
  ./main -h
203
 
204
- stream: examples/stream/stream.cpp ggml.o whisper.o
205
- $(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
206
 
207
- command: examples/command/command.cpp ggml.o whisper.o
208
- $(CXX) $(CXXFLAGS) examples/command/command.cpp ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
209
 
210
- talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp ggml.o whisper.o
211
- $(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
212
 
213
  bench: examples/bench/bench.cpp ggml.o whisper.o
214
  $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
 
197
 
198
  CC_SDL=`sdl2-config --cflags --libs`
199
 
200
+ SRC_COMMON = examples/common.cpp
201
+ SRC_COMMON_SDL = examples/common-sdl.cpp
202
+
203
+ main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
204
+ $(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
205
  ./main -h
206
 
207
+ stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
208
+ $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
209
 
210
+ command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
211
+ $(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
212
 
213
+ talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
214
+ $(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
215
 
216
  bench: examples/bench/bench.cpp ggml.o whisper.o
217
  $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
bindings/javascript/whisper.js CHANGED
The diff for this file is too large to render. See raw diff
 
examples/CMakeLists.txt CHANGED
@@ -14,6 +14,37 @@ if (WHISPER_SUPPORT_SDL2)
14
  message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
15
  endif()
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  # examples
18
 
19
  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
14
  message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
15
  endif()
16
 
17
+ # common
18
+
19
+ set(TARGET common)
20
+
21
+ add_library(${TARGET} STATIC
22
+ common.h
23
+ common.cpp
24
+ )
25
+
26
+ include(DefaultTargetOptions)
27
+
28
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
29
+
30
+ if (WHISPER_SUPPORT_SDL2)
31
+ # common-sdl
32
+
33
+ set(TARGET common-sdl)
34
+
35
+ add_library(${TARGET} STATIC
36
+ common-sdl.h
37
+ common-sdl.cpp
38
+ )
39
+
40
+ include(DefaultTargetOptions)
41
+
42
+ target_include_directories(${TARGET} PUBLIC ${SDL2_INCLUDE_DIRS})
43
+ target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES})
44
+
45
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
46
+ endif()
47
+
48
  # examples
49
 
50
  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
examples/addon.node/CMakeLists.txt CHANGED
@@ -23,7 +23,7 @@ string(REPLACE "\"" "" NODE_ADDON_API_DIR ${NODE_ADDON_API_DIR})
23
  target_include_directories(${TARGET} PRIVATE ${NODE_ADDON_API_DIR})
24
  #==================================================================
25
 
26
- target_link_libraries(${TARGET} ${CMAKE_JS_LIB} whisper ${CMAKE_THREAD_LIBS_INIT})
27
 
28
  if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
29
  # Generate node.lib
 
23
  target_include_directories(${TARGET} PRIVATE ${NODE_ADDON_API_DIR})
24
  #==================================================================
25
 
26
+ target_link_libraries(${TARGET} ${CMAKE_JS_LIB} common whisper ${CMAKE_THREAD_LIBS_INIT})
27
 
28
  if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
29
  # Generate node.lib
examples/addon.node/addon.cpp CHANGED
@@ -1,15 +1,13 @@
1
- #include <cstdint>
 
 
 
 
2
  #include <string>
3
  #include <thread>
4
  #include <vector>
5
  #include <cmath>
6
-
7
- #include "napi.h"
8
-
9
- #define DR_WAV_IMPLEMENTATION
10
- #include "dr_wav.h"
11
-
12
- #include "whisper.h"
13
 
14
  struct whisper_params {
15
  int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -44,7 +42,7 @@ struct whisper_params {
44
  std::string model = "../../ggml-large.bin";
45
 
46
  std::vector<std::string> fname_inp = {};
47
- std::vector<std::string> fname_outp = {};
48
  };
49
 
50
  struct whisper_print_user_data {
@@ -143,7 +141,6 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
143
  }
144
 
145
  int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
146
-
147
  if (params.fname_inp.empty()) {
148
  fprintf(stderr, "error: no input files specified\n");
149
  return 2;
@@ -181,91 +178,14 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
181
 
182
  for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
183
  const auto fname_inp = params.fname_inp[f];
184
- const auto fname_outp = f < (int)params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
185
 
186
  std::vector<float> pcmf32; // mono-channel F32 PCM
187
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
188
 
189
- // WAV input
190
- {
191
- drwav wav;
192
- std::vector<uint8_t> wav_data; // used for pipe input from stdin
193
-
194
- if (fname_inp == "-") {
195
- {
196
- uint8_t buf[1024];
197
- while (true)
198
- {
199
- const size_t n = fread(buf, 1, sizeof(buf), stdin);
200
- if (n == 0) {
201
- break;
202
- }
203
- wav_data.insert(wav_data.end(), buf, buf + n);
204
- }
205
- }
206
-
207
- if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
208
- fprintf(stderr, "error: failed to open WAV file from stdin\n");
209
- return 4;
210
- }
211
-
212
- fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
213
- }
214
- else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
215
- fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
216
- return 5;
217
- }
218
-
219
- if (wav.channels != 1 && wav.channels != 2) {
220
- fprintf(stderr, "error: WAV file '%s' must be mono or stereo\n", fname_inp.c_str());
221
- return 6;
222
- }
223
-
224
- if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
225
- fprintf(stderr, "error: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", fname_inp.c_str());
226
- return 6;
227
- }
228
-
229
- if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
230
- fprintf(stderr, "error: WAV file '%s' must be %i kHz\n", fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
231
- return 8;
232
- }
233
-
234
- if (wav.bitsPerSample != 16) {
235
- fprintf(stderr, "error: WAV file '%s' must be 16-bit\n", fname_inp.c_str());
236
- return 9;
237
- }
238
-
239
- const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
240
-
241
- std::vector<int16_t> pcm16;
242
- pcm16.resize(n*wav.channels);
243
- drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
244
- drwav_uninit(&wav);
245
-
246
- // convert to mono, float
247
- pcmf32.resize(n);
248
- if (wav.channels == 1) {
249
- for (uint64_t i = 0; i < n; i++) {
250
- pcmf32[i] = float(pcm16[i])/32768.0f;
251
- }
252
- } else {
253
- for (uint64_t i = 0; i < n; i++) {
254
- pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
255
- }
256
- }
257
-
258
- if (params.diarize) {
259
- // convert to stereo, float
260
- pcmf32s.resize(2);
261
-
262
- pcmf32s[0].resize(n);
263
- pcmf32s[1].resize(n);
264
- for (uint64_t i = 0; i < n; i++) {
265
- pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
266
- pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
267
- }
268
- }
269
  }
270
 
271
  // print system information
 
1
+ #include "napi.h"
2
+ #include "common.h"
3
+
4
+ #include "whisper.h"
5
+
6
  #include <string>
7
  #include <thread>
8
  #include <vector>
9
  #include <cmath>
10
+ #include <cstdint>
 
 
 
 
 
 
11
 
12
  struct whisper_params {
13
  int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
 
42
  std::string model = "../../ggml-large.bin";
43
 
44
  std::vector<std::string> fname_inp = {};
45
+ std::vector<std::string> fname_out = {};
46
  };
47
 
48
  struct whisper_print_user_data {
 
141
  }
142
 
143
  int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
 
144
  if (params.fname_inp.empty()) {
145
  fprintf(stderr, "error: no input files specified\n");
146
  return 2;
 
178
 
179
  for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
180
  const auto fname_inp = params.fname_inp[f];
181
+ const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
182
 
183
  std::vector<float> pcmf32; // mono-channel F32 PCM
184
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
185
 
186
+ if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
187
+ fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
188
+ continue;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  }
190
 
191
  // print system information
examples/command.wasm/CMakeLists.txt CHANGED
@@ -11,6 +11,7 @@ add_executable(${TARGET}
11
  include(DefaultTargetOptions)
12
 
13
  target_link_libraries(${TARGET} PRIVATE
 
14
  whisper
15
  )
16
 
 
11
  include(DefaultTargetOptions)
12
 
13
  target_link_libraries(${TARGET} PRIVATE
14
+ common
15
  whisper
16
  )
17
 
examples/command.wasm/emscripten.cpp CHANGED
@@ -1,4 +1,5 @@
1
  #include "ggml.h"
 
2
  #include "whisper.h"
3
 
4
  #include <emscripten.h>
@@ -27,24 +28,6 @@ std::string g_transcribed = "";
27
 
28
  std::vector<float> g_pcmf32;
29
 
30
- static std::string trim(const std::string & s) {
31
- std::regex e("^\\s+|\\s+$");
32
- return std::regex_replace(s, e, "");
33
- }
34
-
35
- static void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
36
- const float rc = 1.0f / (2.0f * M_PI * cutoff);
37
- const float dt = 1.0f / sample_rate;
38
- const float alpha = dt / (rc + dt);
39
-
40
- float y = data[0];
41
-
42
- for (size_t i = 1; i < data.size(); i++) {
43
- y = alpha * (y + data[i] - data[i - 1]);
44
- data[i] = y;
45
- }
46
- }
47
-
48
  // compute similarity between two strings using Levenshtein distance
49
  static float similarity(const std::string & s0, const std::string & s1) {
50
  const size_t len0 = s0.size() + 1;
@@ -75,44 +58,6 @@ void command_set_status(const std::string & status) {
75
  g_status = status;
76
  }
77
 
78
- bool command_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
79
- const int n_samples = pcmf32.size();
80
- const int n_samples_last = (sample_rate * last_ms) / 1000;
81
-
82
- if (n_samples_last >= n_samples) {
83
- // not enough samples - assume no speech
84
- return false;
85
- }
86
-
87
- if (freq_thold > 0.0f) {
88
- high_pass_filter(pcmf32, freq_thold, sample_rate);
89
- }
90
-
91
- float energy_all = 0.0f;
92
- float energy_last = 0.0f;
93
-
94
- for (size_t i = 0; i < n_samples; i++) {
95
- energy_all += fabsf(pcmf32[i]);
96
-
97
- if (i >= n_samples - n_samples_last) {
98
- energy_last += fabsf(pcmf32[i]);
99
- }
100
- }
101
-
102
- energy_all /= n_samples;
103
- energy_last /= n_samples_last;
104
-
105
- if (verbose) {
106
- fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
107
- }
108
-
109
- if (energy_last > vad_thold*energy_all) {
110
- return false;
111
- }
112
-
113
- return true;
114
- }
115
-
116
  std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
117
  const auto t_start = std::chrono::high_resolution_clock::now();
118
 
@@ -155,7 +100,7 @@ void command_get_audio(int ms, int sample_rate, std::vector<float> & audio) {
155
  const int64_t n_samples = (ms * sample_rate) / 1000;
156
 
157
  int64_t n_take = 0;
158
- if (g_pcmf32.size() < n_samples) {
159
  n_take = g_pcmf32.size();
160
  } else {
161
  n_take = n_samples;
@@ -187,7 +132,6 @@ void command_main(size_t index) {
187
 
188
  printf("command: using %d threads\n", wparams.n_threads);
189
 
190
- bool is_running = true;
191
  bool have_prompt = false;
192
  bool ask_prompt = true;
193
  bool print_energy = false;
@@ -233,7 +177,7 @@ void command_main(size_t index) {
233
  {
234
  command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
235
 
236
- if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
237
  fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
238
  command_set_status("Speech detected! Processing ...");
239
 
 
1
  #include "ggml.h"
2
+ #include "common.h"
3
  #include "whisper.h"
4
 
5
  #include <emscripten.h>
 
28
 
29
  std::vector<float> g_pcmf32;
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  // compute similarity between two strings using Levenshtein distance
32
  static float similarity(const std::string & s0, const std::string & s1) {
33
  const size_t len0 = s0.size() + 1;
 
58
  g_status = status;
59
  }
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
62
  const auto t_start = std::chrono::high_resolution_clock::now();
63
 
 
100
  const int64_t n_samples = (ms * sample_rate) / 1000;
101
 
102
  int64_t n_take = 0;
103
+ if (n_samples > (int) g_pcmf32.size()) {
104
  n_take = g_pcmf32.size();
105
  } else {
106
  n_take = n_samples;
 
132
 
133
  printf("command: using %d threads\n", wparams.n_threads);
134
 
 
135
  bool have_prompt = false;
136
  bool ask_prompt = true;
137
  bool print_energy = false;
 
177
  {
178
  command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
179
 
180
+ if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
181
  fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
182
  command_set_status("Speech detected! Processing ...");
183
 
examples/command/CMakeLists.txt CHANGED
@@ -5,6 +5,5 @@ if (WHISPER_SUPPORT_SDL2)
5
 
6
  include(DefaultTargetOptions)
7
 
8
- target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
9
- target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
10
  endif ()
 
5
 
6
  include(DefaultTargetOptions)
7
 
8
+ target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
 
9
  endif ()
examples/command/command.cpp CHANGED
@@ -6,11 +6,10 @@
6
  // ref: https://github.com/ggerganov/whisper.cpp/issues/171
7
  //
8
 
 
 
9
  #include "whisper.h"
10
 
11
- #include <SDL.h>
12
- #include <SDL_audio.h>
13
-
14
  #include <sstream>
15
  #include <cassert>
16
  #include <cstdio>
@@ -110,309 +109,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
110
  fprintf(stderr, "\n");
111
  }
112
 
113
- //
114
- // SDL Audio capture
115
- //
116
-
117
- class audio_async {
118
- public:
119
- audio_async(int len_ms);
120
- ~audio_async();
121
-
122
- bool init(int capture_id, int sample_rate);
123
-
124
- // start capturing audio via the provided SDL callback
125
- // keep last len_ms seconds of audio in a circular buffer
126
- bool resume();
127
- bool pause();
128
- bool clear();
129
-
130
- // callback to be called by SDL
131
- void callback(uint8_t * stream, int len);
132
-
133
- // get audio data from the circular buffer
134
- void get(int ms, std::vector<float> & audio);
135
-
136
- private:
137
- SDL_AudioDeviceID m_dev_id_in = 0;
138
-
139
- int m_len_ms = 0;
140
- int m_sample_rate = 0;
141
-
142
- bool m_running = false;
143
- std::mutex m_mutex;
144
-
145
- std::vector<float> m_audio;
146
- std::vector<float> m_audio_new;
147
- size_t m_audio_pos = 0;
148
- size_t m_audio_len = 0;
149
- };
150
-
151
- audio_async::audio_async(int len_ms) {
152
- m_len_ms = len_ms;
153
- }
154
-
155
- audio_async::~audio_async() {
156
- if (m_dev_id_in) {
157
- SDL_CloseAudioDevice(m_dev_id_in);
158
- }
159
- }
160
-
161
- bool audio_async::init(int capture_id, int sample_rate) {
162
- SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
163
-
164
- if (SDL_Init(SDL_INIT_AUDIO) < 0) {
165
- SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
166
- return false;
167
- }
168
-
169
- SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
170
-
171
- {
172
- int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
173
- fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
174
- for (int i = 0; i < nDevices; i++) {
175
- fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
176
- }
177
- }
178
-
179
- SDL_AudioSpec capture_spec_requested;
180
- SDL_AudioSpec capture_spec_obtained;
181
-
182
- SDL_zero(capture_spec_requested);
183
- SDL_zero(capture_spec_obtained);
184
-
185
- capture_spec_requested.freq = sample_rate;
186
- capture_spec_requested.format = AUDIO_F32;
187
- capture_spec_requested.channels = 1;
188
- capture_spec_requested.samples = 1024;
189
- capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
190
- audio_async * audio = (audio_async *) userdata;
191
- audio->callback(stream, len);
192
- };
193
- capture_spec_requested.userdata = this;
194
-
195
- if (capture_id >= 0) {
196
- fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
197
- m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
198
- } else {
199
- fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
200
- m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
201
- }
202
-
203
- if (!m_dev_id_in) {
204
- fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
205
- m_dev_id_in = 0;
206
-
207
- return false;
208
- } else {
209
- fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
210
- fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
211
- fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format,
212
- capture_spec_requested.format);
213
- fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels,
214
- capture_spec_requested.channels);
215
- fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
216
- }
217
-
218
- m_sample_rate = capture_spec_obtained.freq;
219
-
220
- m_audio.resize((m_sample_rate*m_len_ms)/1000);
221
-
222
- return true;
223
- }
224
-
225
- bool audio_async::resume() {
226
- if (!m_dev_id_in) {
227
- fprintf(stderr, "%s: no audio device to resume!\n", __func__);
228
- return false;
229
- }
230
-
231
- if (m_running) {
232
- fprintf(stderr, "%s: already running!\n", __func__);
233
- return false;
234
- }
235
-
236
- SDL_PauseAudioDevice(m_dev_id_in, 0);
237
-
238
- m_running = true;
239
-
240
- return true;
241
- }
242
-
243
- bool audio_async::pause() {
244
- if (!m_dev_id_in) {
245
- fprintf(stderr, "%s: no audio device to pause!\n", __func__);
246
- return false;
247
- }
248
-
249
- if (!m_running) {
250
- fprintf(stderr, "%s: already paused!\n", __func__);
251
- return false;
252
- }
253
-
254
- SDL_PauseAudioDevice(m_dev_id_in, 1);
255
-
256
- m_running = false;
257
-
258
- return true;
259
- }
260
-
261
- bool audio_async::clear() {
262
- if (!m_dev_id_in) {
263
- fprintf(stderr, "%s: no audio device to clear!\n", __func__);
264
- return false;
265
- }
266
-
267
- if (!m_running) {
268
- fprintf(stderr, "%s: not running!\n", __func__);
269
- return false;
270
- }
271
-
272
- {
273
- std::lock_guard<std::mutex> lock(m_mutex);
274
-
275
- m_audio_pos = 0;
276
- m_audio_len = 0;
277
- }
278
-
279
- return true;
280
- }
281
-
282
- // callback to be called by SDL
283
- void audio_async::callback(uint8_t * stream, int len) {
284
- if (!m_running) {
285
- return;
286
- }
287
-
288
- const size_t n_samples = len / sizeof(float);
289
-
290
- m_audio_new.resize(n_samples);
291
- memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
292
-
293
- //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
294
-
295
- {
296
- std::lock_guard<std::mutex> lock(m_mutex);
297
-
298
- if (m_audio_pos + n_samples > m_audio.size()) {
299
- const size_t n0 = m_audio.size() - m_audio_pos;
300
-
301
- memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
302
- memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
303
-
304
- m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
305
- m_audio_len = m_audio.size();
306
- } else {
307
- memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
308
-
309
- m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
310
- m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
311
- }
312
- }
313
- }
314
-
315
- void audio_async::get(int ms, std::vector<float> & result) {
316
- if (!m_dev_id_in) {
317
- fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
318
- return;
319
- }
320
-
321
- if (!m_running) {
322
- fprintf(stderr, "%s: not running!\n", __func__);
323
- return;
324
- }
325
-
326
- result.clear();
327
-
328
- {
329
- std::lock_guard<std::mutex> lock(m_mutex);
330
-
331
- if (ms <= 0) {
332
- ms = m_len_ms;
333
- }
334
-
335
- size_t n_samples = (m_sample_rate * ms) / 1000;
336
- if (n_samples > m_audio_len) {
337
- n_samples = m_audio_len;
338
- }
339
-
340
- result.resize(n_samples);
341
-
342
- int s0 = m_audio_pos - n_samples;
343
- if (s0 < 0) {
344
- s0 += m_audio.size();
345
- }
346
-
347
- if (s0 + n_samples > m_audio.size()) {
348
- const size_t n0 = m_audio.size() - s0;
349
-
350
- memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
351
- memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
352
- } else {
353
- memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
354
- }
355
- }
356
- }
357
-
358
- ///////////////////////////
359
-
360
- std::string trim(const std::string & s) {
361
- std::regex e("^\\s+|\\s+$");
362
- return std::regex_replace(s, e, "");
363
- }
364
-
365
- void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
366
- const float rc = 1.0f / (2.0f * M_PI * cutoff);
367
- const float dt = 1.0f / sample_rate;
368
- const float alpha = dt / (rc + dt);
369
-
370
- float y = data[0];
371
-
372
- for (size_t i = 1; i < data.size(); i++) {
373
- y = alpha * (y + data[i] - data[i - 1]);
374
- data[i] = y;
375
- }
376
- }
377
-
378
- bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
379
- const int n_samples = pcmf32.size();
380
- const int n_samples_last = (sample_rate * last_ms) / 1000;
381
-
382
- if (n_samples_last >= n_samples) {
383
- // not enough samples - assume no speech
384
- return false;
385
- }
386
-
387
- if (freq_thold > 0.0f) {
388
- high_pass_filter(pcmf32, freq_thold, sample_rate);
389
- }
390
-
391
- float energy_all = 0.0f;
392
- float energy_last = 0.0f;
393
-
394
- for (int i = 0; i < n_samples; i++) {
395
- energy_all += fabsf(pcmf32[i]);
396
-
397
- if (i >= n_samples - n_samples_last) {
398
- energy_last += fabsf(pcmf32[i]);
399
- }
400
- }
401
-
402
- energy_all /= n_samples;
403
- energy_last /= n_samples_last;
404
-
405
- if (verbose) {
406
- fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
407
- }
408
-
409
- if (energy_last > vad_thold*energy_all) {
410
- return false;
411
- }
412
-
413
- return true;
414
- }
415
-
416
  std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
417
  const auto t_start = std::chrono::high_resolution_clock::now();
418
 
@@ -502,7 +198,7 @@ std::vector<std::string> read_allowed_commands(const std::string & fname) {
502
 
503
  std::string line;
504
  while (std::getline(ifs, line)) {
505
- line = trim(line);
506
  if (line.empty()) {
507
  continue;
508
  }
@@ -526,23 +222,6 @@ std::vector<std::string> get_words(const std::string &txt) {
526
  return words;
527
  }
528
 
529
- // returns true if no exit event was received
530
- bool process_sdl_events() {
531
- SDL_Event event;
532
- while (SDL_PollEvent(&event)) {
533
- switch (event.type) {
534
- case SDL_QUIT:
535
- {
536
- return false;
537
- } break;
538
- default:
539
- break;
540
- }
541
- }
542
-
543
- return true;
544
- }
545
-
546
  // command-list mode
547
  // guide the transcription to match the most likely command from a provided list
548
  int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
@@ -634,14 +313,14 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
634
  // main loop
635
  while (is_running) {
636
  // handle Ctrl + C
637
- is_running = process_sdl_events();
638
 
639
  // delay
640
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
641
 
642
  audio.get(2000, pcmf32_cur);
643
 
644
- if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
645
  fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
646
 
647
  const auto t_start = std::chrono::high_resolution_clock::now();
@@ -775,7 +454,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
775
  // main loop
776
  while (is_running) {
777
  // handle Ctrl + C
778
- is_running = process_sdl_events();
779
 
780
  // delay
781
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
@@ -791,7 +470,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
791
  {
792
  audio.get(2000, pcmf32_cur);
793
 
794
- if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
795
  fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
796
 
797
  int64_t t_ms = 0;
@@ -854,7 +533,7 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
854
  // main loop
855
  while (is_running) {
856
  // handle Ctrl + C
857
- is_running = process_sdl_events();
858
 
859
  // delay
860
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
@@ -870,7 +549,7 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
870
  {
871
  audio.get(2000, pcmf32_cur);
872
 
873
- if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
874
  fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
875
 
876
  int64_t t_ms = 0;
 
6
  // ref: https://github.com/ggerganov/whisper.cpp/issues/171
7
  //
8
 
9
+ #include "common.h"
10
+ #include "common-sdl.h"
11
  #include "whisper.h"
12
 
 
 
 
13
  #include <sstream>
14
  #include <cassert>
15
  #include <cstdio>
 
109
  fprintf(stderr, "\n");
110
  }
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
113
  const auto t_start = std::chrono::high_resolution_clock::now();
114
 
 
198
 
199
  std::string line;
200
  while (std::getline(ifs, line)) {
201
+ line = ::trim(line);
202
  if (line.empty()) {
203
  continue;
204
  }
 
222
  return words;
223
  }
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  // command-list mode
226
  // guide the transcription to match the most likely command from a provided list
227
  int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
 
313
  // main loop
314
  while (is_running) {
315
  // handle Ctrl + C
316
+ is_running = sdl_poll_events();
317
 
318
  // delay
319
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
320
 
321
  audio.get(2000, pcmf32_cur);
322
 
323
+ if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
324
  fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
325
 
326
  const auto t_start = std::chrono::high_resolution_clock::now();
 
454
  // main loop
455
  while (is_running) {
456
  // handle Ctrl + C
457
+ is_running = sdl_poll_events();
458
 
459
  // delay
460
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
470
  {
471
  audio.get(2000, pcmf32_cur);
472
 
473
+ if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
474
  fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
475
 
476
  int64_t t_ms = 0;
 
533
  // main loop
534
  while (is_running) {
535
  // handle Ctrl + C
536
+ is_running = sdl_poll_events();
537
 
538
  // delay
539
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
549
  {
550
  audio.get(2000, pcmf32_cur);
551
 
552
+ if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
553
  fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
554
 
555
  int64_t t_ms = 0;
examples/common-sdl.cpp ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "common-sdl.h"
2
+
3
+ audio_async::audio_async(int len_ms) {
4
+ m_len_ms = len_ms;
5
+
6
+ m_running = false;
7
+ }
8
+
9
+ audio_async::~audio_async() {
10
+ if (m_dev_id_in) {
11
+ SDL_CloseAudioDevice(m_dev_id_in);
12
+ }
13
+ }
14
+
15
+ bool audio_async::init(int capture_id, int sample_rate) {
16
+ SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
17
+
18
+ if (SDL_Init(SDL_INIT_AUDIO) < 0) {
19
+ SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
20
+ return false;
21
+ }
22
+
23
+ SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
24
+
25
+ {
26
+ int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
27
+ fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
28
+ for (int i = 0; i < nDevices; i++) {
29
+ fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
30
+ }
31
+ }
32
+
33
+ SDL_AudioSpec capture_spec_requested;
34
+ SDL_AudioSpec capture_spec_obtained;
35
+
36
+ SDL_zero(capture_spec_requested);
37
+ SDL_zero(capture_spec_obtained);
38
+
39
+ capture_spec_requested.freq = sample_rate;
40
+ capture_spec_requested.format = AUDIO_F32;
41
+ capture_spec_requested.channels = 1;
42
+ capture_spec_requested.samples = 1024;
43
+ capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
44
+ audio_async * audio = (audio_async *) userdata;
45
+ audio->callback(stream, len);
46
+ };
47
+ capture_spec_requested.userdata = this;
48
+
49
+ if (capture_id >= 0) {
50
+ fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
51
+ m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
52
+ } else {
53
+ fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
54
+ m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
55
+ }
56
+
57
+ if (!m_dev_id_in) {
58
+ fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
59
+ m_dev_id_in = 0;
60
+
61
+ return false;
62
+ } else {
63
+ fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
64
+ fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
65
+ fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format,
66
+ capture_spec_requested.format);
67
+ fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels,
68
+ capture_spec_requested.channels);
69
+ fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
70
+ }
71
+
72
+ m_sample_rate = capture_spec_obtained.freq;
73
+
74
+ m_audio.resize((m_sample_rate*m_len_ms)/1000);
75
+
76
+ return true;
77
+ }
78
+
79
+ bool audio_async::resume() {
80
+ if (!m_dev_id_in) {
81
+ fprintf(stderr, "%s: no audio device to resume!\n", __func__);
82
+ return false;
83
+ }
84
+
85
+ if (m_running) {
86
+ fprintf(stderr, "%s: already running!\n", __func__);
87
+ return false;
88
+ }
89
+
90
+ SDL_PauseAudioDevice(m_dev_id_in, 0);
91
+
92
+ m_running = true;
93
+
94
+ return true;
95
+ }
96
+
97
+ bool audio_async::pause() {
98
+ if (!m_dev_id_in) {
99
+ fprintf(stderr, "%s: no audio device to pause!\n", __func__);
100
+ return false;
101
+ }
102
+
103
+ if (!m_running) {
104
+ fprintf(stderr, "%s: already paused!\n", __func__);
105
+ return false;
106
+ }
107
+
108
+ SDL_PauseAudioDevice(m_dev_id_in, 1);
109
+
110
+ m_running = false;
111
+
112
+ return true;
113
+ }
114
+
115
+ bool audio_async::clear() {
116
+ if (!m_dev_id_in) {
117
+ fprintf(stderr, "%s: no audio device to clear!\n", __func__);
118
+ return false;
119
+ }
120
+
121
+ if (!m_running) {
122
+ fprintf(stderr, "%s: not running!\n", __func__);
123
+ return false;
124
+ }
125
+
126
+ {
127
+ std::lock_guard<std::mutex> lock(m_mutex);
128
+
129
+ m_audio_pos = 0;
130
+ m_audio_len = 0;
131
+ }
132
+
133
+ return true;
134
+ }
135
+
136
+ // callback to be called by SDL
137
+ void audio_async::callback(uint8_t * stream, int len) {
138
+ if (!m_running) {
139
+ return;
140
+ }
141
+
142
+ const size_t n_samples = len / sizeof(float);
143
+
144
+ m_audio_new.resize(n_samples);
145
+ memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
146
+
147
+ //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
148
+
149
+ {
150
+ std::lock_guard<std::mutex> lock(m_mutex);
151
+
152
+ if (m_audio_pos + n_samples > m_audio.size()) {
153
+ const size_t n0 = m_audio.size() - m_audio_pos;
154
+
155
+ memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
156
+ memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
157
+
158
+ m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
159
+ m_audio_len = m_audio.size();
160
+ } else {
161
+ memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
162
+
163
+ m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
164
+ m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
165
+ }
166
+ }
167
+ }
168
+
169
+ void audio_async::get(int ms, std::vector<float> & result) {
170
+ if (!m_dev_id_in) {
171
+ fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
172
+ return;
173
+ }
174
+
175
+ if (!m_running) {
176
+ fprintf(stderr, "%s: not running!\n", __func__);
177
+ return;
178
+ }
179
+
180
+ result.clear();
181
+
182
+ {
183
+ std::lock_guard<std::mutex> lock(m_mutex);
184
+
185
+ if (ms <= 0) {
186
+ ms = m_len_ms;
187
+ }
188
+
189
+ size_t n_samples = (m_sample_rate * ms) / 1000;
190
+ if (n_samples > m_audio_len) {
191
+ n_samples = m_audio_len;
192
+ }
193
+
194
+ result.resize(n_samples);
195
+
196
+ int s0 = m_audio_pos - n_samples;
197
+ if (s0 < 0) {
198
+ s0 += m_audio.size();
199
+ }
200
+
201
+ if (s0 + n_samples > m_audio.size()) {
202
+ const size_t n0 = m_audio.size() - s0;
203
+
204
+ memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
205
+ memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
206
+ } else {
207
+ memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
208
+ }
209
+ }
210
+ }
211
+
212
+ bool sdl_poll_events() {
213
+ SDL_Event event;
214
+ while (SDL_PollEvent(&event)) {
215
+ switch (event.type) {
216
+ case SDL_QUIT:
217
+ {
218
+ return false;
219
+ } break;
220
+ default:
221
+ break;
222
+ }
223
+ }
224
+
225
+ return true;
226
+ }
examples/common-sdl.h ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <SDL.h>
4
+ #include <SDL_audio.h>
5
+
6
+ #include <atomic>
7
+ #include <cstdint>
8
+ #include <vector>
9
+ #include <mutex>
10
+
11
+ //
12
+ // SDL Audio capture
13
+ //
14
+
15
+ class audio_async {
16
+ public:
17
+ audio_async(int len_ms);
18
+ ~audio_async();
19
+
20
+ bool init(int capture_id, int sample_rate);
21
+
22
+ // start capturing audio via the provided SDL callback
23
+ // keep last len_ms seconds of audio in a circular buffer
24
+ bool resume();
25
+ bool pause();
26
+ bool clear();
27
+
28
+ // callback to be called by SDL
29
+ void callback(uint8_t * stream, int len);
30
+
31
+ // get audio data from the circular buffer
32
+ void get(int ms, std::vector<float> & audio);
33
+
34
+ private:
35
+ SDL_AudioDeviceID m_dev_id_in = 0;
36
+
37
+ int m_len_ms = 0;
38
+ int m_sample_rate = 0;
39
+
40
+ std::atomic_bool m_running;
41
+ std::mutex m_mutex;
42
+
43
+ std::vector<float> m_audio;
44
+ std::vector<float> m_audio_new;
45
+ size_t m_audio_pos = 0;
46
+ size_t m_audio_len = 0;
47
+ };
48
+
49
+ // Return false if need to quit
50
+ bool sdl_poll_events();
examples/common.cpp ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "common.h"
2
+
3
+ // third-party utilities
4
+ // use your favorite implementations
5
+ #define DR_WAV_IMPLEMENTATION
6
+ #include "dr_wav.h"
7
+
8
+ #include <cmath>
9
+ #include <regex>
10
+
11
+ #ifndef M_PI
12
+ #define M_PI 3.14159265358979323846
13
+ #endif
14
+
15
// Return a copy of `s` with leading and trailing whitespace removed.
// Whitespace is space, \t, \n, \v, \f and \r (the characters the previous
// "\s"-based regular expression matches in the default "C" locale).
// Interior whitespace is preserved; an all-whitespace input yields "".
std::string trim(const std::string & s) {
    static const char * const k_whitespace = " \t\n\v\f\r";

    const size_t first = s.find_first_not_of(k_whitespace);
    if (first == std::string::npos) {
        return std::string();
    }

    // cannot be npos here: at least the character at `first` is non-whitespace
    const size_t last = s.find_last_not_of(k_whitespace);

    return s.substr(first, last - first + 1);
}
19
+
20
// Return a copy of `s` in which every non-overlapping occurrence of `from`
// is replaced by `to`, scanning left to right. Text inserted by a replacement
// is never re-scanned, so `to` may itself contain `from`.
// An empty `from` matches nothing: `s` is returned unchanged.
std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    std::string result = s;

    // find("") succeeds at every position, which would make the loop below
    // spin (or grow the string) forever
    if (from.empty()) {
        return result;
    }

    size_t pos = 0;
    while ((pos = result.find(from, pos)) != std::string::npos) {
        result.replace(pos, from.length(), to);
        pos += to.length(); // resume after the inserted text
    }

    return result;
}
29
+
30
+ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
31
+ drwav wav;
32
+ std::vector<uint8_t> wav_data; // used for pipe input from stdin
33
+
34
+ if (fname == "-") {
35
+ {
36
+ uint8_t buf[1024];
37
+ while (true)
38
+ {
39
+ const size_t n = fread(buf, 1, sizeof(buf), stdin);
40
+ if (n == 0) {
41
+ break;
42
+ }
43
+ wav_data.insert(wav_data.end(), buf, buf + n);
44
+ }
45
+ }
46
+
47
+ if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
48
+ fprintf(stderr, "error: failed to open WAV file from stdin\n");
49
+ return false;
50
+ }
51
+
52
+ fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
53
+ }
54
+ else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
55
+ fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
56
+ return false;
57
+ }
58
+
59
+ if (wav.channels != 1 && wav.channels != 2) {
60
+ fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
61
+ return false;
62
+ }
63
+
64
+ if (stereo && wav.channels != 2) {
65
+ fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
66
+ return false;
67
+ }
68
+
69
+ if (wav.sampleRate != COMMON_SAMPLE_RATE) {
70
+ fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
71
+ return false;
72
+ }
73
+
74
+ if (wav.bitsPerSample != 16) {
75
+ fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
76
+ return false;
77
+ }
78
+
79
+ const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
80
+
81
+ std::vector<int16_t> pcm16;
82
+ pcm16.resize(n*wav.channels);
83
+ drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
84
+ drwav_uninit(&wav);
85
+
86
+ // convert to mono, float
87
+ pcmf32.resize(n);
88
+ if (wav.channels == 1) {
89
+ for (uint64_t i = 0; i < n; i++) {
90
+ pcmf32[i] = float(pcm16[i])/32768.0f;
91
+ }
92
+ } else {
93
+ for (uint64_t i = 0; i < n; i++) {
94
+ pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
95
+ }
96
+ }
97
+
98
+ if (stereo) {
99
+ // convert to stereo, float
100
+ pcmf32s.resize(2);
101
+
102
+ pcmf32s[0].resize(n);
103
+ pcmf32s[1].resize(n);
104
+ for (uint64_t i = 0; i < n; i++) {
105
+ pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
106
+ pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
107
+ }
108
+ }
109
+
110
+ return true;
111
+ }
112
+
113
// Filter `data` in place with a first-order recurrence derived from `cutoff`
// (Hz) and `sample_rate` (Hz). The first sample is left unchanged.
// An empty `data` is a no-op.
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.empty()) {
        // nothing to filter; also avoids reading data[0] below
        return;
    }

    const float rc = 1.0f / (2.0f * M_PI * cutoff);
    const float dt = 1.0f / sample_rate;
    const float alpha = dt / (rc + dt);

    float y = data[0];

    // NOTE(review): data[i - 1] has already been overwritten with the previous
    // output by the time it is read here, so it is the filtered value rather
    // than the raw input sample - confirm this is intended before relying on
    // the exact frequency response
    for (size_t i = 1; i < data.size(); i++) {
        y = alpha * (y + data[i] - data[i - 1]);
        data[i] = y;
    }
}
125
+
126
+ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
127
+ const int n_samples = pcmf32.size();
128
+ const int n_samples_last = (sample_rate * last_ms) / 1000;
129
+
130
+ if (n_samples_last >= n_samples) {
131
+ // not enough samples - assume no speech
132
+ return false;
133
+ }
134
+
135
+ if (freq_thold > 0.0f) {
136
+ high_pass_filter(pcmf32, freq_thold, sample_rate);
137
+ }
138
+
139
+ float energy_all = 0.0f;
140
+ float energy_last = 0.0f;
141
+
142
+ for (int i = 0; i < n_samples; i++) {
143
+ energy_all += fabsf(pcmf32[i]);
144
+
145
+ if (i >= n_samples - n_samples_last) {
146
+ energy_last += fabsf(pcmf32[i]);
147
+ }
148
+ }
149
+
150
+ energy_all /= n_samples;
151
+ energy_last /= n_samples_last;
152
+
153
+ if (verbose) {
154
+ fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
155
+ }
156
+
157
+ if (energy_last > vad_thold*energy_all) {
158
+ return false;
159
+ }
160
+
161
+ return true;
162
+ }
examples/common.h ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // needs to match WHISPER_SAMPLE_RATE
4
+ #define COMMON_SAMPLE_RATE 16000
5
+
6
+ #include <vector>
7
+ #include <string>
8
+
9
// Strip leading and trailing whitespace from s
std::string trim(const std::string & s);

// Replace every occurrence of `from` in s with `to` and return the result
std::string replace(const std::string & s, const std::string & from, const std::string & to);

// Load a WAV file (or stdin when fname is "-") as float PCM
// - pcmf32 receives the mono signal
// - pcmf32s receives the two separate channels when the stereo flag is set (2-channel input required)
// The audio must be sampled at COMMON_SAMPLE_RATE
// Returns false on failure
bool read_wav(const std::string & fname, std::vector<float> & pcmf32, std::vector<std::vector<float>> & pcmf32s, bool stereo);

// High-pass filter applied to PCM audio in place
// cutoff and sample_rate are given in Hz
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate);

// Simple energy-based voice activity detection (VAD)
// Compares the energy of the last `last_ms` milliseconds with the energy of the whole buffer
// Note: pcmf32 is filtered in place when freq_thold > 0
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose);
40
+
examples/main/CMakeLists.txt CHANGED
@@ -3,4 +3,4 @@ add_executable(${TARGET} main.cpp)
3
 
4
  include(DefaultTargetOptions)
5
 
6
- target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
 
3
 
4
  include(DefaultTargetOptions)
5
 
6
+ target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
examples/main/main.cpp CHANGED
@@ -1,9 +1,6 @@
1
- #include "whisper.h"
2
 
3
- // third-party utilities
4
- // use your favorite implementations
5
- #define DR_WAV_IMPLEMENTATION
6
- #include "dr_wav.h"
7
 
8
  #include <cmath>
9
  #include <fstream>
@@ -86,7 +83,7 @@ struct whisper_params {
86
  std::string model = "models/ggml-base.en.bin";
87
 
88
  std::vector<std::string> fname_inp = {};
89
- std::vector<std::string> fname_outp = {};
90
  };
91
 
92
  void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -126,7 +123,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
126
  else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
127
  else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
128
  else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
129
- else if (arg == "-of" || arg == "--output-file") { params.fname_outp.emplace_back(argv[++i]); }
130
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
131
  else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
132
  else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
@@ -520,91 +517,14 @@ int main(int argc, char ** argv) {
520
 
521
  for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
522
  const auto fname_inp = params.fname_inp[f];
523
- const auto fname_outp = f < (int) params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
524
 
525
- std::vector<float> pcmf32; // mono-channel F32 PCM
526
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
527
 
528
- // WAV input
529
- {
530
- drwav wav;
531
- std::vector<uint8_t> wav_data; // used for pipe input from stdin
532
-
533
- if (fname_inp == "-") {
534
- {
535
- uint8_t buf[1024];
536
- while (true)
537
- {
538
- const size_t n = fread(buf, 1, sizeof(buf), stdin);
539
- if (n == 0) {
540
- break;
541
- }
542
- wav_data.insert(wav_data.end(), buf, buf + n);
543
- }
544
- }
545
-
546
- if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
547
- fprintf(stderr, "error: failed to open WAV file from stdin\n");
548
- return 4;
549
- }
550
-
551
- fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
552
- }
553
- else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
554
- fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
555
- return 5;
556
- }
557
-
558
- if (wav.channels != 1 && wav.channels != 2) {
559
- fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
560
- return 6;
561
- }
562
-
563
- if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
564
- fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", argv[0], fname_inp.c_str());
565
- return 6;
566
- }
567
-
568
- if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
569
- fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", argv[0], fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
570
- return 8;
571
- }
572
-
573
- if (wav.bitsPerSample != 16) {
574
- fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
575
- return 9;
576
- }
577
-
578
- const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
579
-
580
- std::vector<int16_t> pcm16;
581
- pcm16.resize(n*wav.channels);
582
- drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
583
- drwav_uninit(&wav);
584
-
585
- // convert to mono, float
586
- pcmf32.resize(n);
587
- if (wav.channels == 1) {
588
- for (uint64_t i = 0; i < n; i++) {
589
- pcmf32[i] = float(pcm16[i])/32768.0f;
590
- }
591
- } else {
592
- for (uint64_t i = 0; i < n; i++) {
593
- pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
594
- }
595
- }
596
-
597
- if (params.diarize) {
598
- // convert to stereo, float
599
- pcmf32s.resize(2);
600
-
601
- pcmf32s[0].resize(n);
602
- pcmf32s[1].resize(n);
603
- for (uint64_t i = 0; i < n; i++) {
604
- pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
605
- pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
606
- }
607
- }
608
  }
609
 
610
  // print system information
@@ -701,34 +621,33 @@ int main(int argc, char ** argv) {
701
 
702
  // output to text file
703
  if (params.output_txt) {
704
- const auto fname_txt = fname_outp + ".txt";
705
  output_txt(ctx, fname_txt.c_str());
706
  }
707
 
708
  // output to VTT file
709
  if (params.output_vtt) {
710
- const auto fname_vtt = fname_outp + ".vtt";
711
  output_vtt(ctx, fname_vtt.c_str());
712
  }
713
 
714
  // output to SRT file
715
  if (params.output_srt) {
716
- const auto fname_srt = fname_outp + ".srt";
717
  output_srt(ctx, fname_srt.c_str(), params);
718
  }
719
 
720
  // output to WTS file
721
  if (params.output_wts) {
722
- const auto fname_wts = fname_outp + ".wts";
723
  output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
724
  }
725
 
726
- // output to CSV file
727
  if (params.output_csv) {
728
- const auto fname_csv = fname_outp + ".csv";
729
  output_csv(ctx, fname_csv.c_str());
730
  }
731
-
732
  }
733
  }
734
 
 
1
+ #include "common.h"
2
 
3
+ #include "whisper.h"
 
 
 
4
 
5
  #include <cmath>
6
  #include <fstream>
 
83
  std::string model = "models/ggml-base.en.bin";
84
 
85
  std::vector<std::string> fname_inp = {};
86
+ std::vector<std::string> fname_out = {};
87
  };
88
 
89
  void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
 
123
  else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
124
  else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
125
  else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
126
+ else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
127
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
128
  else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
129
  else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
 
517
 
518
  for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
519
  const auto fname_inp = params.fname_inp[f];
520
+ const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
521
 
522
+ std::vector<float> pcmf32; // mono-channel F32 PCM
523
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
524
 
525
+ if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
526
+ fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
527
+ continue;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
  }
529
 
530
  // print system information
 
621
 
622
  // output to text file
623
  if (params.output_txt) {
624
+ const auto fname_txt = fname_out + ".txt";
625
  output_txt(ctx, fname_txt.c_str());
626
  }
627
 
628
  // output to VTT file
629
  if (params.output_vtt) {
630
+ const auto fname_vtt = fname_out + ".vtt";
631
  output_vtt(ctx, fname_vtt.c_str());
632
  }
633
 
634
  // output to SRT file
635
  if (params.output_srt) {
636
+ const auto fname_srt = fname_out + ".srt";
637
  output_srt(ctx, fname_srt.c_str(), params);
638
  }
639
 
640
  // output to WTS file
641
  if (params.output_wts) {
642
+ const auto fname_wts = fname_out + ".wts";
643
  output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
644
  }
645
 
646
+ // output to CSV file
647
  if (params.output_csv) {
648
+ const auto fname_csv = fname_out + ".csv";
649
  output_csv(ctx, fname_csv.c_str());
650
  }
 
651
  }
652
  }
653
 
examples/stream/CMakeLists.txt CHANGED
@@ -5,6 +5,5 @@ if (WHISPER_SUPPORT_SDL2)
5
 
6
  include(DefaultTargetOptions)
7
 
8
- target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
9
- target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
10
  endif ()
 
5
 
6
  include(DefaultTargetOptions)
7
 
8
+ target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
 
9
  endif ()
examples/stream/stream.cpp CHANGED
@@ -3,19 +3,16 @@
3
  // A very quick-n-dirty implementation serving mainly as a proof of concept.
4
  //
5
 
 
 
6
  #include "whisper.h"
7
 
8
- #include <SDL.h>
9
- #include <SDL_audio.h>
10
-
11
- #include <atomic>
12
  #include <cassert>
13
  #include <cstdio>
14
  #include <string>
15
  #include <thread>
16
  #include <vector>
17
  #include <fstream>
18
- #include <mutex>
19
 
20
  // 500 -> 00:05.000
21
  // 6000 -> 01:00.000
@@ -116,306 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
116
  fprintf(stderr, "\n");
117
  }
118
 
119
- //
120
- // SDL Audio capture
121
- //
122
-
123
- class audio_async {
124
- public:
125
- audio_async(int len_ms);
126
- ~audio_async();
127
-
128
- bool init(int capture_id, int sample_rate);
129
-
130
- // start capturing audio via the provided SDL callback
131
- // keep last len_ms seconds of audio in a circular buffer
132
- bool resume();
133
- bool pause();
134
- bool clear();
135
-
136
- // callback to be called by SDL
137
- void callback(uint8_t * stream, int len);
138
-
139
- // get audio data from the circular buffer
140
- void get(int ms, std::vector<float> & audio);
141
-
142
- private:
143
- SDL_AudioDeviceID m_dev_id_in = 0;
144
-
145
- int m_len_ms = 0;
146
- int m_sample_rate = 0;
147
-
148
- std::atomic_bool m_running;
149
- std::mutex m_mutex;
150
-
151
- std::vector<float> m_audio;
152
- std::vector<float> m_audio_new;
153
- size_t m_audio_pos = 0;
154
- size_t m_audio_len = 0;
155
- };
156
-
157
- audio_async::audio_async(int len_ms) {
158
- m_len_ms = len_ms;
159
-
160
- m_running = false;
161
- }
162
-
163
- audio_async::~audio_async() {
164
- if (m_dev_id_in) {
165
- SDL_CloseAudioDevice(m_dev_id_in);
166
- }
167
- }
168
-
169
- bool audio_async::init(int capture_id, int sample_rate) {
170
- SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
171
-
172
- if (SDL_Init(SDL_INIT_AUDIO) < 0) {
173
- SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
174
- return false;
175
- }
176
-
177
- SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
178
-
179
- {
180
- int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
181
- fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
182
- for (int i = 0; i < nDevices; i++) {
183
- fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
184
- }
185
- }
186
-
187
- SDL_AudioSpec capture_spec_requested;
188
- SDL_AudioSpec capture_spec_obtained;
189
-
190
- SDL_zero(capture_spec_requested);
191
- SDL_zero(capture_spec_obtained);
192
-
193
- capture_spec_requested.freq = sample_rate;
194
- capture_spec_requested.format = AUDIO_F32;
195
- capture_spec_requested.channels = 1;
196
- capture_spec_requested.samples = 1024;
197
- capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
198
- audio_async * audio = (audio_async *) userdata;
199
- audio->callback(stream, len);
200
- };
201
- capture_spec_requested.userdata = this;
202
-
203
- if (capture_id >= 0) {
204
- fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
205
- m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
206
- } else {
207
- fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
208
- m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
209
- }
210
-
211
- if (!m_dev_id_in) {
212
- fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
213
- m_dev_id_in = 0;
214
-
215
- return false;
216
- } else {
217
- fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
218
- fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
219
- fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format,
220
- capture_spec_requested.format);
221
- fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels,
222
- capture_spec_requested.channels);
223
- fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
224
- }
225
-
226
- m_sample_rate = capture_spec_obtained.freq;
227
-
228
- m_audio.resize((m_sample_rate*m_len_ms)/1000);
229
-
230
- return true;
231
- }
232
-
233
- bool audio_async::resume() {
234
- if (!m_dev_id_in) {
235
- fprintf(stderr, "%s: no audio device to resume!\n", __func__);
236
- return false;
237
- }
238
-
239
- if (m_running) {
240
- fprintf(stderr, "%s: already running!\n", __func__);
241
- return false;
242
- }
243
-
244
- SDL_PauseAudioDevice(m_dev_id_in, 0);
245
-
246
- m_running = true;
247
-
248
- return true;
249
- }
250
-
251
- bool audio_async::pause() {
252
- if (!m_dev_id_in) {
253
- fprintf(stderr, "%s: no audio device to pause!\n", __func__);
254
- return false;
255
- }
256
-
257
- if (!m_running) {
258
- fprintf(stderr, "%s: already paused!\n", __func__);
259
- return false;
260
- }
261
-
262
- SDL_PauseAudioDevice(m_dev_id_in, 1);
263
-
264
- m_running = false;
265
-
266
- return true;
267
- }
268
-
269
- bool audio_async::clear() {
270
- if (!m_dev_id_in) {
271
- fprintf(stderr, "%s: no audio device to clear!\n", __func__);
272
- return false;
273
- }
274
-
275
- if (!m_running) {
276
- fprintf(stderr, "%s: not running!\n", __func__);
277
- return false;
278
- }
279
-
280
- {
281
- std::lock_guard<std::mutex> lock(m_mutex);
282
-
283
- m_audio_pos = 0;
284
- m_audio_len = 0;
285
- }
286
-
287
- return true;
288
- }
289
-
290
- // callback to be called by SDL
291
- void audio_async::callback(uint8_t * stream, int len) {
292
- if (!m_running) {
293
- return;
294
- }
295
-
296
- const size_t n_samples = len / sizeof(float);
297
-
298
- m_audio_new.resize(n_samples);
299
- memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
300
-
301
- //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
302
-
303
- {
304
- std::lock_guard<std::mutex> lock(m_mutex);
305
-
306
- if (m_audio_pos + n_samples > m_audio.size()) {
307
- const size_t n0 = m_audio.size() - m_audio_pos;
308
-
309
- memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
310
- memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
311
-
312
- m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
313
- m_audio_len = m_audio.size();
314
- } else {
315
- memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
316
-
317
- m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
318
- m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
319
- }
320
- }
321
- }
322
-
323
- void audio_async::get(int ms, std::vector<float> & result) {
324
- if (!m_dev_id_in) {
325
- fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
326
- return;
327
- }
328
-
329
- if (!m_running) {
330
- fprintf(stderr, "%s: not running!\n", __func__);
331
- return;
332
- }
333
-
334
- result.clear();
335
-
336
- {
337
- std::lock_guard<std::mutex> lock(m_mutex);
338
-
339
- if (ms <= 0) {
340
- ms = m_len_ms;
341
- }
342
-
343
- size_t n_samples = (m_sample_rate * ms) / 1000;
344
- if (n_samples > m_audio_len) {
345
- n_samples = m_audio_len;
346
- }
347
-
348
- result.resize(n_samples);
349
-
350
- int s0 = m_audio_pos - n_samples;
351
- if (s0 < 0) {
352
- s0 += m_audio.size();
353
- }
354
-
355
- if (s0 + n_samples > m_audio.size()) {
356
- const size_t n0 = m_audio.size() - s0;
357
-
358
- memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
359
- memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
360
- } else {
361
- memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
362
- }
363
- }
364
- }
365
-
366
- ///////////////////////////
367
-
368
- void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
369
- const float rc = 1.0f / (2.0f * M_PI * cutoff);
370
- const float dt = 1.0f / sample_rate;
371
- const float alpha = dt / (rc + dt);
372
-
373
- float y = data[0];
374
-
375
- for (size_t i = 1; i < data.size(); i++) {
376
- y = alpha * (y + data[i] - data[i - 1]);
377
- data[i] = y;
378
- }
379
- }
380
-
381
- bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
382
- const int n_samples = pcmf32.size();
383
- const int n_samples_last = (sample_rate * last_ms) / 1000;
384
-
385
- if (n_samples_last >= n_samples) {
386
- // not enough samples - assume no speech
387
- return false;
388
- }
389
-
390
- if (freq_thold > 0.0f) {
391
- high_pass_filter(pcmf32, freq_thold, sample_rate);
392
- }
393
-
394
- float energy_all = 0.0f;
395
- float energy_last = 0.0f;
396
-
397
- for (int i = 0; i < n_samples; i++) {
398
- energy_all += fabsf(pcmf32[i]);
399
-
400
- if (i >= n_samples - n_samples_last) {
401
- energy_last += fabsf(pcmf32[i]);
402
- }
403
- }
404
-
405
- energy_all /= n_samples;
406
- energy_last /= n_samples_last;
407
-
408
- if (verbose) {
409
- fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
410
- }
411
-
412
- if (energy_last > vad_thold*energy_all) {
413
- return false;
414
- }
415
-
416
- return true;
417
- }
418
-
419
  int main(int argc, char ** argv) {
420
  whisper_params params;
421
 
@@ -426,10 +123,10 @@ int main(int argc, char ** argv) {
426
  params.keep_ms = std::min(params.keep_ms, params.step_ms);
427
  params.length_ms = std::max(params.length_ms, params.step_ms);
428
 
429
- const int n_samples_step = (params.step_ms *1e-3)*WHISPER_SAMPLE_RATE;
430
- const int n_samples_len = (params.length_ms*1e-3)*WHISPER_SAMPLE_RATE;
431
- const int n_samples_keep = (params.keep_ms *1e-3)*WHISPER_SAMPLE_RATE;
432
- const int n_samples_30s = (30000 *1e-3)*WHISPER_SAMPLE_RATE;
433
 
434
  const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
435
 
@@ -517,23 +214,7 @@ int main(int argc, char ** argv) {
517
  // main audio loop
518
  while (is_running) {
519
  // handle Ctrl + C
520
- {
521
- SDL_Event event;
522
- while (SDL_PollEvent(&event)) {
523
- switch (event.type) {
524
- case SDL_QUIT:
525
- {
526
- is_running = false;
527
- } break;
528
- default:
529
- break;
530
- }
531
- }
532
-
533
- if (!is_running) {
534
- break;
535
- }
536
- }
537
 
538
  if (!is_running) {
539
  break;
@@ -556,7 +237,7 @@ int main(int argc, char ** argv) {
556
  break;
557
  }
558
 
559
- SDL_Delay(1);
560
  }
561
 
562
  const int n_samples_new = pcmf32_new.size();
@@ -587,7 +268,7 @@ int main(int argc, char ** argv) {
587
 
588
  audio.get(2000, pcmf32_new);
589
 
590
- if (vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
591
  audio.get(params.length_ms, pcmf32);
592
  } else {
593
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
3
  // A very quick-n-dirty implementation serving mainly as a proof of concept.
4
  //
5
 
6
+ #include "common.h"
7
+ #include "common-sdl.h"
8
  #include "whisper.h"
9
 
 
 
 
 
10
  #include <cassert>
11
  #include <cstdio>
12
  #include <string>
13
  #include <thread>
14
  #include <vector>
15
  #include <fstream>
 
16
 
17
  // 500 -> 00:05.000
18
  // 6000 -> 01:00.000
 
113
  fprintf(stderr, "\n");
114
  }
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  int main(int argc, char ** argv) {
117
  whisper_params params;
118
 
 
123
  params.keep_ms = std::min(params.keep_ms, params.step_ms);
124
  params.length_ms = std::max(params.length_ms, params.step_ms);
125
 
126
+ const int n_samples_step = (1e-3*params.step_ms )*WHISPER_SAMPLE_RATE;
127
+ const int n_samples_len = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
128
+ const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE;
129
+ const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE;
130
 
131
  const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
132
 
 
214
  // main audio loop
215
  while (is_running) {
216
  // handle Ctrl + C
217
+ is_running = sdl_poll_events();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
  if (!is_running) {
220
  break;
 
237
  break;
238
  }
239
 
240
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
241
  }
242
 
243
  const int n_samples_new = pcmf32_new.size();
 
268
 
269
  audio.get(2000, pcmf32_new);
270
 
271
+ if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
272
  audio.get(params.length_ms, pcmf32);
273
  } else {
274
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
examples/talk/CMakeLists.txt CHANGED
@@ -7,7 +7,7 @@ if (WHISPER_SUPPORT_SDL2)
7
 
8
  # TODO: this is temporary
9
  # need to export ggml symbols for MSVC, but too lazy ..
10
- add_executable(${TARGET} talk.cpp gpt-2.cpp ../../ggml.c ../../whisper.cpp)
11
 
12
  include(DefaultTargetOptions)
13
 
 
7
 
8
  # TODO: this is temporary
9
  # need to export ggml symbols for MSVC, but too lazy ..
10
+ add_executable(${TARGET} talk.cpp gpt-2.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
11
 
12
  include(DefaultTargetOptions)
13
 
examples/talk/talk.cpp CHANGED
@@ -1,16 +1,14 @@
1
  // Talk with AI
2
  //
3
 
 
 
4
  #include "whisper.h"
5
  #include "gpt-2.h"
6
 
7
- #include <SDL.h>
8
- #include <SDL_audio.h>
9
-
10
  #include <cassert>
11
  #include <cstdio>
12
  #include <fstream>
13
- #include <mutex>
14
  #include <regex>
15
  #include <string>
16
  #include <thread>
@@ -105,320 +103,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
105
  fprintf(stderr, "\n");
106
  }
107
 
108
- //
109
- // SDL Audio capture
110
- //
111
-
112
- class audio_async {
113
- public:
114
- audio_async(int len_ms);
115
- ~audio_async();
116
-
117
- bool init(int capture_id, int sample_rate);
118
-
119
- // start capturing audio via the provided SDL callback
120
- // keep last len_ms seconds of audio in a circular buffer
121
- bool resume();
122
- bool pause();
123
- bool clear();
124
-
125
- // callback to be called by SDL
126
- void callback(uint8_t * stream, int len);
127
-
128
- // get audio data from the circular buffer
129
- void get(int ms, std::vector<float> & audio);
130
-
131
- private:
132
- SDL_AudioDeviceID m_dev_id_in = 0;
133
-
134
- int m_len_ms = 0;
135
- int m_sample_rate = 0;
136
-
137
- bool m_running = false;
138
- std::mutex m_mutex;
139
-
140
- std::vector<float> m_audio;
141
- std::vector<float> m_audio_new;
142
- size_t m_audio_pos = 0;
143
- size_t m_audio_len = 0;
144
- };
145
-
146
- audio_async::audio_async(int len_ms) {
147
- m_len_ms = len_ms;
148
- }
149
-
150
- audio_async::~audio_async() {
151
- if (m_dev_id_in) {
152
- SDL_CloseAudioDevice(m_dev_id_in);
153
- }
154
- }
155
-
156
- bool audio_async::init(int capture_id, int sample_rate) {
157
- SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
158
-
159
- if (SDL_Init(SDL_INIT_AUDIO) < 0) {
160
- SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
161
- return false;
162
- }
163
-
164
- SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
165
-
166
- {
167
- int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
168
- fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
169
- for (int i = 0; i < nDevices; i++) {
170
- fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
171
- }
172
- }
173
-
174
- SDL_AudioSpec capture_spec_requested;
175
- SDL_AudioSpec capture_spec_obtained;
176
-
177
- SDL_zero(capture_spec_requested);
178
- SDL_zero(capture_spec_obtained);
179
-
180
- capture_spec_requested.freq = sample_rate;
181
- capture_spec_requested.format = AUDIO_F32;
182
- capture_spec_requested.channels = 1;
183
- capture_spec_requested.samples = 1024;
184
- capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
185
- audio_async * audio = (audio_async *) userdata;
186
- audio->callback(stream, len);
187
- };
188
- capture_spec_requested.userdata = this;
189
-
190
- if (capture_id >= 0) {
191
- fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
192
- m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
193
- } else {
194
- fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
195
- m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
196
- }
197
-
198
- if (!m_dev_id_in) {
199
- fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
200
- m_dev_id_in = 0;
201
-
202
- return false;
203
- } else {
204
- fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
205
- fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
206
- fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format,
207
- capture_spec_requested.format);
208
- fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels,
209
- capture_spec_requested.channels);
210
- fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
211
- fprintf(stderr, "\n");
212
- }
213
-
214
- m_sample_rate = capture_spec_obtained.freq;
215
-
216
- m_audio.resize((m_sample_rate*m_len_ms)/1000);
217
-
218
- return true;
219
- }
220
-
221
- bool audio_async::resume() {
222
- if (!m_dev_id_in) {
223
- fprintf(stderr, "%s: no audio device to resume!\n", __func__);
224
- return false;
225
- }
226
-
227
- if (m_running) {
228
- fprintf(stderr, "%s: already running!\n", __func__);
229
- return false;
230
- }
231
-
232
- SDL_PauseAudioDevice(m_dev_id_in, 0);
233
-
234
- m_running = true;
235
-
236
- return true;
237
- }
238
-
239
- bool audio_async::pause() {
240
- if (!m_dev_id_in) {
241
- fprintf(stderr, "%s: no audio device to pause!\n", __func__);
242
- return false;
243
- }
244
-
245
- if (!m_running) {
246
- fprintf(stderr, "%s: already paused!\n", __func__);
247
- return false;
248
- }
249
-
250
- SDL_PauseAudioDevice(m_dev_id_in, 1);
251
-
252
- m_running = false;
253
-
254
- return true;
255
- }
256
-
257
- bool audio_async::clear() {
258
- if (!m_dev_id_in) {
259
- fprintf(stderr, "%s: no audio device to clear!\n", __func__);
260
- return false;
261
- }
262
-
263
- if (!m_running) {
264
- fprintf(stderr, "%s: not running!\n", __func__);
265
- return false;
266
- }
267
-
268
- {
269
- std::lock_guard<std::mutex> lock(m_mutex);
270
-
271
- m_audio_pos = 0;
272
- m_audio_len = 0;
273
- }
274
-
275
- return true;
276
- }
277
-
278
- // callback to be called by SDL
279
- void audio_async::callback(uint8_t * stream, int len) {
280
- if (!m_running) {
281
- return;
282
- }
283
-
284
- const size_t n_samples = len / sizeof(float);
285
-
286
- m_audio_new.resize(n_samples);
287
- memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
288
-
289
- //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
290
-
291
- {
292
- std::lock_guard<std::mutex> lock(m_mutex);
293
-
294
- if (m_audio_pos + n_samples > m_audio.size()) {
295
- const size_t n0 = m_audio.size() - m_audio_pos;
296
-
297
- memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
298
- memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
299
-
300
- m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
301
- m_audio_len = m_audio.size();
302
- } else {
303
- memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
304
-
305
- m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
306
- m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
307
- }
308
- }
309
- }
310
-
311
- void audio_async::get(int ms, std::vector<float> & result) {
312
- if (!m_dev_id_in) {
313
- fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
314
- return;
315
- }
316
-
317
- if (!m_running) {
318
- fprintf(stderr, "%s: not running!\n", __func__);
319
- return;
320
- }
321
-
322
- result.clear();
323
-
324
- {
325
- std::lock_guard<std::mutex> lock(m_mutex);
326
-
327
- if (ms <= 0) {
328
- ms = m_len_ms;
329
- }
330
-
331
- size_t n_samples = (m_sample_rate * ms) / 1000;
332
- if (n_samples > m_audio_len) {
333
- n_samples = m_audio_len;
334
- }
335
-
336
- result.resize(n_samples);
337
-
338
- int s0 = m_audio_pos - n_samples;
339
- if (s0 < 0) {
340
- s0 += m_audio.size();
341
- }
342
-
343
- if (s0 + n_samples > m_audio.size()) {
344
- const size_t n0 = m_audio.size() - s0;
345
-
346
- memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
347
- memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
348
- } else {
349
- memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
350
- }
351
- }
352
- }
353
-
354
- ///////////////////////////
355
-
356
- std::string trim(const std::string & s) {
357
- std::regex e("^\\s+|\\s+$");
358
- return std::regex_replace(s, e, "");
359
- }
360
-
361
- std::string replace(const std::string & s, const std::string & from, const std::string & to) {
362
- std::string result = s;
363
- size_t pos = 0;
364
- while ((pos = result.find(from, pos)) != std::string::npos) {
365
- result.replace(pos, from.length(), to);
366
- pos += to.length();
367
- }
368
- return result;
369
- }
370
-
371
- void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
372
- const float rc = 1.0f / (2.0f * M_PI * cutoff);
373
- const float dt = 1.0f / sample_rate;
374
- const float alpha = dt / (rc + dt);
375
-
376
- float y = data[0];
377
-
378
- for (size_t i = 1; i < data.size(); i++) {
379
- y = alpha * (y + data[i] - data[i - 1]);
380
- data[i] = y;
381
- }
382
- }
383
-
384
- bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
385
- const int n_samples = pcmf32.size();
386
- const int n_samples_last = (sample_rate * last_ms) / 1000;
387
-
388
- if (n_samples_last >= n_samples) {
389
- // not enough samples - assume no speech
390
- return false;
391
- }
392
-
393
- if (freq_thold > 0.0f) {
394
- high_pass_filter(pcmf32, freq_thold, sample_rate);
395
- }
396
-
397
- float energy_all = 0.0f;
398
- float energy_last = 0.0f;
399
-
400
- for (int i = 0; i < n_samples; i++) {
401
- energy_all += fabsf(pcmf32[i]);
402
-
403
- if (i >= n_samples - n_samples_last) {
404
- energy_last += fabsf(pcmf32[i]);
405
- }
406
- }
407
-
408
- energy_all /= n_samples;
409
- energy_last /= n_samples_last;
410
-
411
- if (verbose) {
412
- fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
413
- }
414
-
415
- if (energy_last > vad_thold*energy_all) {
416
- return false;
417
- }
418
-
419
- return true;
420
- }
421
-
422
  std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
423
  const auto t_start = std::chrono::high_resolution_clock::now();
424
 
@@ -557,22 +241,10 @@ int main(int argc, char ** argv) {
557
  // main loop
558
  while (is_running) {
559
  // handle Ctrl + C
560
- {
561
- SDL_Event event;
562
- while (SDL_PollEvent(&event)) {
563
- switch (event.type) {
564
- case SDL_QUIT:
565
- {
566
- is_running = false;
567
- } break;
568
- default:
569
- break;
570
- }
571
- }
572
 
573
- if (!is_running) {
574
- break;
575
- }
576
  }
577
 
578
  // delay
@@ -583,7 +255,7 @@ int main(int argc, char ** argv) {
583
  {
584
  audio.get(2000, pcmf32_cur);
585
 
586
- if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
587
  fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
588
 
589
  audio.get(params.voice_ms, pcmf32_cur);
 
1
  // Talk with AI
2
  //
3
 
4
+ #include "common.h"
5
+ #include "common-sdl.h"
6
  #include "whisper.h"
7
  #include "gpt-2.h"
8
 
 
 
 
9
  #include <cassert>
10
  #include <cstdio>
11
  #include <fstream>
 
12
  #include <regex>
13
  #include <string>
14
  #include <thread>
 
103
  fprintf(stderr, "\n");
104
  }
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
107
  const auto t_start = std::chrono::high_resolution_clock::now();
108
 
 
241
  // main loop
242
  while (is_running) {
243
  // handle Ctrl + C
244
+ is_running = sdl_poll_events();
 
 
 
 
 
 
 
 
 
 
 
245
 
246
+ if (!is_running) {
247
+ break;
 
248
  }
249
 
250
  // delay
 
255
  {
256
  audio.get(2000, pcmf32_cur);
257
 
258
+ if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
259
  fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
260
 
261
  audio.get(params.voice_ms, pcmf32_cur);