whisper.cpp

Running

App Files Community

ggerganov committed on Feb 15, 2023

Commit

acbd6f7

unverified ·

1 Parent(s): 560e81f

examples : refactor in order to reuse code and reduce duplication (#482)

Browse files

* examples : refactor common code into a library

* examples : refactor common SDL code into a library

* make : update Makefile to use common libs

* common : fix MSVC M_PI ..

* addon.node : link common lib

Files changed (19) hide show

Makefile +11 -8
bindings/javascript/whisper.js +0 -0
examples/CMakeLists.txt +31 -0
examples/addon.node/CMakeLists.txt +1 -1
examples/addon.node/addon.cpp +11 -91
examples/command.wasm/CMakeLists.txt +1 -0
examples/command.wasm/emscripten.cpp +3 -59
examples/command/CMakeLists.txt +1 -2
examples/command/command.cpp +9 -330
examples/common-sdl.cpp +226 -0
examples/common-sdl.h +50 -0
examples/common.cpp +162 -0
examples/common.h +40 -0
examples/main/CMakeLists.txt +1 -1
examples/main/main.cpp +15 -96
examples/stream/CMakeLists.txt +1 -2
examples/stream/stream.cpp +9 -328
examples/talk/CMakeLists.txt +1 -1
examples/talk/talk.cpp +6 -334

Makefile CHANGED Viewed

@@ -197,18 +197,21 @@ clean:
 CC_SDL=`sdl2-config --cflags --libs`
-main: examples/main/main.cpp ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o whisper.o -o main $(LDFLAGS)
 	./main -h
-stream: examples/stream/stream.cpp ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
-command: examples/command/command.cpp ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/command/command.cpp ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
-talk: examples/talk/talk.cpp  examples/talk/gpt-2.cpp ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
 bench: examples/bench/bench.cpp ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)

 CC_SDL=`sdl2-config --cflags --libs`
+SRC_COMMON = examples/common.cpp
+SRC_COMMON_SDL = examples/common-sdl.cpp
+main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
 	./main -h
+stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
+command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
+talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
 bench: examples/bench/bench.cpp ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)

bindings/javascript/whisper.js CHANGED Viewed

The diff for this file is too large to render. See raw diff

examples/CMakeLists.txt CHANGED Viewed

@@ -14,6 +14,37 @@ if (WHISPER_SUPPORT_SDL2)
     message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
 endif()
 # examples
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})

     message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
 endif()
+# common
+set(TARGET common)
+add_library(${TARGET} STATIC
+    common.h
+    common.cpp
+    )
+include(DefaultTargetOptions)
+set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+if (WHISPER_SUPPORT_SDL2)
+    # common-sdl
+    set(TARGET common-sdl)
+    add_library(${TARGET} STATIC
+        common-sdl.h
+        common-sdl.cpp
+        )
+    include(DefaultTargetOptions)
+    target_include_directories(${TARGET} PUBLIC ${SDL2_INCLUDE_DIRS})
+    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES})
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
 # examples
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})

examples/addon.node/CMakeLists.txt CHANGED Viewed

@@ -23,7 +23,7 @@ string(REPLACE "\"" "" NODE_ADDON_API_DIR ${NODE_ADDON_API_DIR})
 target_include_directories(${TARGET} PRIVATE ${NODE_ADDON_API_DIR})
 #==================================================================
-target_link_libraries(${TARGET} ${CMAKE_JS_LIB} whisper ${CMAKE_THREAD_LIBS_INIT})
 if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
     # Generate node.lib

 target_include_directories(${TARGET} PRIVATE ${NODE_ADDON_API_DIR})
 #==================================================================
+target_link_libraries(${TARGET} ${CMAKE_JS_LIB} common whisper ${CMAKE_THREAD_LIBS_INIT})
 if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
     # Generate node.lib

examples/addon.node/addon.cpp CHANGED Viewed

@@ -1,15 +1,13 @@
-#include <cstdint>
 #include <string>
 #include <thread>
 #include <vector>
 #include <cmath>
-#include "napi.h"
-#define DR_WAV_IMPLEMENTATION
-#include "dr_wav.h"
-#include "whisper.h"
 struct whisper_params {
     int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -44,7 +42,7 @@ struct whisper_params {
     std::string model    = "../../ggml-large.bin";
     std::vector<std::string> fname_inp = {};
-    std::vector<std::string> fname_outp = {};
 };
 struct whisper_print_user_data {
@@ -143,7 +141,6 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
 }
 int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
     if (params.fname_inp.empty()) {
         fprintf(stderr, "error: no input files specified\n");
         return 2;
@@ -181,91 +178,14 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
     for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
         const auto fname_inp = params.fname_inp[f];
-        const auto fname_outp = f < (int)params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
         std::vector<float> pcmf32; // mono-channel F32 PCM
         std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-        // WAV input
-        {
-            drwav wav;
-            std::vector<uint8_t> wav_data; // used for pipe input from stdin
-            if (fname_inp == "-") {
-                {
-                    uint8_t buf[1024];
-                    while (true)
-                    {
-                        const size_t n = fread(buf, 1, sizeof(buf), stdin);
-                        if (n == 0) {
-                            break;
-                        }
-                        wav_data.insert(wav_data.end(), buf, buf + n);
-                    }
-                }
-                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
-                    fprintf(stderr, "error: failed to open WAV file from stdin\n");
-                    return 4;
-                }
-                fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
-            }
-            else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
-                fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
-                return 5;
-            }
-           if (wav.channels != 1 && wav.channels != 2) {
-               fprintf(stderr, "error: WAV file '%s' must be mono or stereo\n", fname_inp.c_str());
-               return 6;
-           }
-           if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
-               fprintf(stderr, "error: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", fname_inp.c_str());
-               return 6;
-           }
-           if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
-               fprintf(stderr, "error: WAV file '%s' must be %i kHz\n", fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
-               return 8;
-           }
-           if (wav.bitsPerSample != 16) {
-               fprintf(stderr, "error: WAV file '%s' must be 16-bit\n", fname_inp.c_str());
-               return 9;
-           }
-            const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
-            std::vector<int16_t> pcm16;
-            pcm16.resize(n*wav.channels);
-            drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
-            drwav_uninit(&wav);
-            // convert to mono, float
-            pcmf32.resize(n);
-            if (wav.channels == 1) {
-                for (uint64_t i = 0; i < n; i++) {
-                    pcmf32[i] = float(pcm16[i])/32768.0f;
-                }
-            } else {
-                for (uint64_t i = 0; i < n; i++) {
-                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
-                }
-            }
-            if (params.diarize) {
-                // convert to stereo, float
-                pcmf32s.resize(2);
-                pcmf32s[0].resize(n);
-                pcmf32s[1].resize(n);
-                for (uint64_t i = 0; i < n; i++) {
-                    pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
-                    pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
-                }
-            }
         }
         // print system information

+#include "napi.h"
+#include "common.h"
+#include "whisper.h"
 #include <string>
 #include <thread>
 #include <vector>
 #include <cmath>
+#include <cstdint>
 struct whisper_params {
     int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
     std::string model    = "../../ggml-large.bin";
     std::vector<std::string> fname_inp = {};
+    std::vector<std::string> fname_out = {};
 };
 struct whisper_print_user_data {
 }
 int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
     if (params.fname_inp.empty()) {
         fprintf(stderr, "error: no input files specified\n");
         return 2;
     for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
         const auto fname_inp = params.fname_inp[f];
+        const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
         std::vector<float> pcmf32; // mono-channel F32 PCM
         std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+        if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
+            fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
+            continue;
         }
         // print system information

examples/command.wasm/CMakeLists.txt CHANGED Viewed

@@ -11,6 +11,7 @@ add_executable(${TARGET}
 include(DefaultTargetOptions)
 target_link_libraries(${TARGET} PRIVATE
     whisper
     )

 include(DefaultTargetOptions)
 target_link_libraries(${TARGET} PRIVATE
+    common
     whisper
     )

examples/command.wasm/emscripten.cpp CHANGED Viewed

@@ -1,4 +1,5 @@
 #include "ggml.h"
 #include "whisper.h"
 #include <emscripten.h>
@@ -27,24 +28,6 @@ std::string g_transcribed   = "";
 std::vector<float> g_pcmf32;
-static std::string trim(const std::string & s) {
-    std::regex e("^\\s+|\\s+$");
-    return std::regex_replace(s, e, "");
-}
-static void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
-    const float rc = 1.0f / (2.0f * M_PI * cutoff);
-    const float dt = 1.0f / sample_rate;
-    const float alpha = dt / (rc + dt);
-    float y = data[0];
-    for (size_t i = 1; i < data.size(); i++) {
-        y = alpha * (y + data[i] - data[i - 1]);
-        data[i] = y;
-    }
-}
 // compute similarity between two strings using Levenshtein distance
 static float similarity(const std::string & s0, const std::string & s1) {
     const size_t len0 = s0.size() + 1;
@@ -75,44 +58,6 @@ void command_set_status(const std::string & status) {
     g_status = status;
 }
-bool command_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
-    const int n_samples      = pcmf32.size();
-    const int n_samples_last = (sample_rate * last_ms) / 1000;
-    if (n_samples_last >= n_samples) {
-        // not enough samples - assume no speech
-        return false;
-    }
-    if (freq_thold > 0.0f) {
-        high_pass_filter(pcmf32, freq_thold, sample_rate);
-    }
-    float energy_all  = 0.0f;
-    float energy_last = 0.0f;
-    for (size_t i = 0; i < n_samples; i++) {
-        energy_all += fabsf(pcmf32[i]);
-        if (i >= n_samples - n_samples_last) {
-            energy_last += fabsf(pcmf32[i]);
-        }
-    }
-    energy_all  /= n_samples;
-    energy_last /= n_samples_last;
-    if (verbose) {
-        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
-    }
-    if (energy_last > vad_thold*energy_all) {
-        return false;
-    }
-    return true;
-}
 std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
     const auto t_start = std::chrono::high_resolution_clock::now();
@@ -155,7 +100,7 @@ void command_get_audio(int ms, int sample_rate, std::vector<float> & audio) {
     const int64_t n_samples = (ms * sample_rate) / 1000;
     int64_t n_take = 0;
-    if (g_pcmf32.size() < n_samples) {
         n_take = g_pcmf32.size();
     } else {
         n_take = n_samples;
@@ -187,7 +132,6 @@ void command_main(size_t index) {
     printf("command: using %d threads\n", wparams.n_threads);
-    bool is_running   = true;
     bool have_prompt  = false;
     bool ask_prompt   = true;
     bool print_energy = false;
@@ -233,7 +177,7 @@ void command_main(size_t index) {
         {
             command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
-            if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
                 fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                 command_set_status("Speech detected! Processing ...");

 #include "ggml.h"
+#include "common.h"
 #include "whisper.h"
 #include <emscripten.h>
 std::vector<float> g_pcmf32;
 // compute similarity between two strings using Levenshtein distance
 static float similarity(const std::string & s0, const std::string & s1) {
     const size_t len0 = s0.size() + 1;
     g_status = status;
 }
 std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
     const auto t_start = std::chrono::high_resolution_clock::now();
     const int64_t n_samples = (ms * sample_rate) / 1000;
     int64_t n_take = 0;
+    if (n_samples > (int) g_pcmf32.size()) {
         n_take = g_pcmf32.size();
     } else {
         n_take = n_samples;
     printf("command: using %d threads\n", wparams.n_threads);
     bool have_prompt  = false;
     bool ask_prompt   = true;
     bool print_energy = false;
         {
             command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
+            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
                 fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                 command_set_status("Speech detected! Processing ...");

examples/command/CMakeLists.txt CHANGED Viewed

@@ -5,6 +5,5 @@ if (WHISPER_SUPPORT_SDL2)
     include(DefaultTargetOptions)
-    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
-    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()

     include(DefaultTargetOptions)
+    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
 endif ()

examples/command/command.cpp CHANGED Viewed

@@ -6,11 +6,10 @@
 // ref: https://github.com/ggerganov/whisper.cpp/issues/171
 //
 #include "whisper.h"
-#include <SDL.h>
-#include <SDL_audio.h>
 #include <sstream>
 #include <cassert>
 #include <cstdio>
@@ -110,309 +109,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "\n");
 }
-//
-// SDL Audio capture
-//
-class audio_async {
-public:
-    audio_async(int len_ms);
-    ~audio_async();
-    bool init(int capture_id, int sample_rate);
-    // start capturing audio via the provided SDL callback
-    // keep last len_ms seconds of audio in a circular buffer
-    bool resume();
-    bool pause();
-    bool clear();
-    // callback to be called by SDL
-    void callback(uint8_t * stream, int len);
-    // get audio data from the circular buffer
-    void get(int ms, std::vector<float> & audio);
-private:
-    SDL_AudioDeviceID m_dev_id_in = 0;
-    int m_len_ms = 0;
-    int m_sample_rate = 0;
-    bool       m_running = false;
-    std::mutex m_mutex;
-    std::vector<float> m_audio;
-    std::vector<float> m_audio_new;
-    size_t             m_audio_pos = 0;
-    size_t             m_audio_len = 0;
-};
-audio_async::audio_async(int len_ms) {
-    m_len_ms = len_ms;
-}
-audio_async::~audio_async() {
-    if (m_dev_id_in) {
-        SDL_CloseAudioDevice(m_dev_id_in);
-    }
-}
-bool audio_async::init(int capture_id, int sample_rate) {
-    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
-    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
-        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
-        return false;
-    }
-    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
-    {
-        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
-        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
-        for (int i = 0; i < nDevices; i++) {
-            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
-        }
-    }
-    SDL_AudioSpec capture_spec_requested;
-    SDL_AudioSpec capture_spec_obtained;
-    SDL_zero(capture_spec_requested);
-    SDL_zero(capture_spec_obtained);
-    capture_spec_requested.freq     = sample_rate;
-    capture_spec_requested.format   = AUDIO_F32;
-    capture_spec_requested.channels = 1;
-    capture_spec_requested.samples  = 1024;
-    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
-        audio_async * audio = (audio_async *) userdata;
-        audio->callback(stream, len);
-    };
-    capture_spec_requested.userdata = this;
-    if (capture_id >= 0) {
-        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
-        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-    } else {
-        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
-        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-    }
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
-        m_dev_id_in = 0;
-        return false;
-    } else {
-        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
-        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
-        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
-                capture_spec_requested.format);
-        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
-                capture_spec_requested.channels);
-        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
-    }
-    m_sample_rate = capture_spec_obtained.freq;
-    m_audio.resize((m_sample_rate*m_len_ms)/1000);
-    return true;
-}
-bool audio_async::resume() {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
-        return false;
-    }
-    if (m_running) {
-        fprintf(stderr, "%s: already running!\n", __func__);
-        return false;
-    }
-    SDL_PauseAudioDevice(m_dev_id_in, 0);
-    m_running = true;
-    return true;
-}
-bool audio_async::pause() {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
-        return false;
-    }
-    if (!m_running) {
-        fprintf(stderr, "%s: already paused!\n", __func__);
-        return false;
-    }
-    SDL_PauseAudioDevice(m_dev_id_in, 1);
-    m_running = false;
-    return true;
-}
-bool audio_async::clear() {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
-        return false;
-    }
-    if (!m_running) {
-        fprintf(stderr, "%s: not running!\n", __func__);
-        return false;
-    }
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-        m_audio_pos = 0;
-        m_audio_len = 0;
-    }
-    return true;
-}
-// callback to be called by SDL
-void audio_async::callback(uint8_t * stream, int len) {
-    if (!m_running) {
-        return;
-    }
-    const size_t n_samples = len / sizeof(float);
-    m_audio_new.resize(n_samples);
-    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
-    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-        if (m_audio_pos + n_samples > m_audio.size()) {
-            const size_t n0 = m_audio.size() - m_audio_pos;
-            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
-            memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
-            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
-            m_audio_len = m_audio.size();
-        } else {
-            memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
-            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
-            m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
-        }
-    }
-}
-void audio_async::get(int ms, std::vector<float> & result) {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
-        return;
-    }
-    if (!m_running) {
-        fprintf(stderr, "%s: not running!\n", __func__);
-        return;
-    }
-    result.clear();
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-        if (ms <= 0) {
-            ms = m_len_ms;
-        }
-        size_t n_samples = (m_sample_rate * ms) / 1000;
-        if (n_samples > m_audio_len) {
-            n_samples = m_audio_len;
-        }
-        result.resize(n_samples);
-        int s0 = m_audio_pos - n_samples;
-        if (s0 < 0) {
-            s0 += m_audio.size();
-        }
-        if (s0 + n_samples > m_audio.size()) {
-            const size_t n0 = m_audio.size() - s0;
-            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
-            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
-        } else {
-            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
-        }
-    }
-}
-///////////////////////////
-std::string trim(const std::string & s) {
-    std::regex e("^\\s+|\\s+$");
-    return std::regex_replace(s, e, "");
-}
-void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
-    const float rc = 1.0f / (2.0f * M_PI * cutoff);
-    const float dt = 1.0f / sample_rate;
-    const float alpha = dt / (rc + dt);
-    float y = data[0];
-    for (size_t i = 1; i < data.size(); i++) {
-        y = alpha * (y + data[i] - data[i - 1]);
-        data[i] = y;
-    }
-}
-bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
-    const int n_samples      = pcmf32.size();
-    const int n_samples_last = (sample_rate * last_ms) / 1000;
-    if (n_samples_last >= n_samples) {
-        // not enough samples - assume no speech
-        return false;
-    }
-    if (freq_thold > 0.0f) {
-        high_pass_filter(pcmf32, freq_thold, sample_rate);
-    }
-    float energy_all  = 0.0f;
-    float energy_last = 0.0f;
-    for (int i = 0; i < n_samples; i++) {
-        energy_all += fabsf(pcmf32[i]);
-        if (i >= n_samples - n_samples_last) {
-            energy_last += fabsf(pcmf32[i]);
-        }
-    }
-    energy_all  /= n_samples;
-    energy_last /= n_samples_last;
-    if (verbose) {
-        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
-    }
-    if (energy_last > vad_thold*energy_all) {
-        return false;
-    }
-    return true;
-}
 std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
     const auto t_start = std::chrono::high_resolution_clock::now();
@@ -502,7 +198,7 @@ std::vector<std::string> read_allowed_commands(const std::string & fname) {
     std::string line;
     while (std::getline(ifs, line)) {
-        line = trim(line);
         if (line.empty()) {
             continue;
         }
@@ -526,23 +222,6 @@ std::vector<std::string> get_words(const std::string &txt) {
     return words;
 }
-// returns true if no exit event was received
-bool process_sdl_events() {
-    SDL_Event event;
-    while (SDL_PollEvent(&event)) {
-        switch (event.type) {
-            case SDL_QUIT:
-                {
-                    return false;
-                } break;
-            default:
-                break;
-        }
-    }
-    return true;
-}
 // command-list mode
 // guide the transcription to match the most likely command from a provided list
 int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
@@ -634,14 +313,14 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
     // main loop
     while (is_running) {
         // handle Ctrl + C
-        is_running = process_sdl_events();
         // delay
         std::this_thread::sleep_for(std::chrono::milliseconds(100));
         audio.get(2000, pcmf32_cur);
-        if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
             fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
             const auto t_start = std::chrono::high_resolution_clock::now();
@@ -775,7 +454,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
     // main loop
     while (is_running) {
         // handle Ctrl + C
-        is_running = process_sdl_events();
         // delay
         std::this_thread::sleep_for(std::chrono::milliseconds(100));
@@ -791,7 +470,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
         {
             audio.get(2000, pcmf32_cur);
-            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
                 fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                 int64_t t_ms = 0;
@@ -854,7 +533,7 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
     // main loop
     while (is_running) {
         // handle Ctrl + C
-        is_running = process_sdl_events();
         // delay
         std::this_thread::sleep_for(std::chrono::milliseconds(100));
@@ -870,7 +549,7 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
         {
             audio.get(2000, pcmf32_cur);
-            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
                 fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                 int64_t t_ms = 0;

 // ref: https://github.com/ggerganov/whisper.cpp/issues/171
 //
+#include "common.h"
+#include "common-sdl.h"
 #include "whisper.h"
 #include <sstream>
 #include <cassert>
 #include <cstdio>
     fprintf(stderr, "\n");
 }
 std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
     const auto t_start = std::chrono::high_resolution_clock::now();
     std::string line;
     while (std::getline(ifs, line)) {
+        line = ::trim(line);
         if (line.empty()) {
             continue;
         }
     return words;
 }
 // command-list mode
 // guide the transcription to match the most likely command from a provided list
 int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
     // main loop
     while (is_running) {
         // handle Ctrl + C
+        is_running = sdl_poll_events();
         // delay
         std::this_thread::sleep_for(std::chrono::milliseconds(100));
         audio.get(2000, pcmf32_cur);
+        if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
             fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
             const auto t_start = std::chrono::high_resolution_clock::now();
     // main loop
     while (is_running) {
         // handle Ctrl + C
+        is_running = sdl_poll_events();
         // delay
         std::this_thread::sleep_for(std::chrono::milliseconds(100));
         {
             audio.get(2000, pcmf32_cur);
+            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
                 fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                 int64_t t_ms = 0;
     // main loop
     while (is_running) {
         // handle Ctrl + C
+        is_running = sdl_poll_events();
         // delay
         std::this_thread::sleep_for(std::chrono::milliseconds(100));
         {
             audio.get(2000, pcmf32_cur);
+            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
                 fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                 int64_t t_ms = 0;

examples/common-sdl.cpp ADDED Viewed

	@@ -0,0 +1,226 @@

+#include "common-sdl.h"
+// Store the requested buffer length (in milliseconds) and start in the "not running" state.
+audio_async::audio_async(int len_ms)
+    : m_len_ms(len_ms),
+      m_running(false) {
+}
+// Close the SDL capture device, if one was opened.
+audio_async::~audio_async() {
+    if (m_dev_id_in == 0) {
+        return;
+    }
+    SDL_CloseAudioDevice(m_dev_id_in);
+}
+// Initialize the SDL audio subsystem and open a capture device.
+//
+//   capture_id  - index of the capture device to open (as listed on stderr);
+//                 a negative value opens the default capture device
+//   sample_rate - requested sample rate in Hz
+//
+// Returns false if SDL fails to initialize or no capture device can be opened.
+// On success m_sample_rate holds the rate obtained from SDL and the circular
+// buffer is sized to hold m_len_ms milliseconds of audio at that rate.
+bool audio_async::init(int capture_id, int sample_rate) {
+    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
+
+    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
+        return false;
+    }
+
+    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
+
+    {
+        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
+        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
+        for (int i = 0; i < nDevices; i++) {
+            // SDL_GetAudioDeviceName() may return NULL - never pass NULL to "%s"
+            const char * name = SDL_GetAudioDeviceName(i, SDL_TRUE);
+            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, name ? name : "(unknown)");
+        }
+    }
+
+    SDL_AudioSpec capture_spec_requested;
+    SDL_AudioSpec capture_spec_obtained;
+
+    SDL_zero(capture_spec_requested);
+    SDL_zero(capture_spec_obtained);
+
+    capture_spec_requested.freq     = sample_rate;
+    capture_spec_requested.format   = AUDIO_F32;
+    capture_spec_requested.channels = 1;
+    capture_spec_requested.samples  = 1024;
+    // SDL hands back `userdata` untouched, so route the C callback to this instance
+    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
+        audio_async * audio = (audio_async *) userdata;
+        audio->callback(stream, len);
+    };
+    capture_spec_requested.userdata = this;
+
+    if (capture_id >= 0) {
+        // look the name up once; a NULL name makes SDL_OpenAudioDevice() pick the default device
+        const char * dev_name = SDL_GetAudioDeviceName(capture_id, SDL_TRUE);
+        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, dev_name ? dev_name : "(unknown)");
+        m_dev_id_in = SDL_OpenAudioDevice(dev_name, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    } else {
+        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
+        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    }
+
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
+        return false;
+    }
+
+    fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
+    fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
+    fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
+            capture_spec_requested.format);
+    fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
+            capture_spec_requested.channels);
+    fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
+
+    m_sample_rate = capture_spec_obtained.freq;
+
+    // multiply in 64 bits: sample_rate * len_ms can exceed INT_MAX for long buffers
+    m_audio.resize((size_t) (((int64_t) m_sample_rate*m_len_ms)/1000));
+
+    return true;
+}
+// Unpause the capture device and mark capture as running.
+// Returns false (after logging to stderr) if no device is open or capture is already running.
+bool audio_async::resume() {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
+        return false;
+    }
+
+    const bool already_running = m_running;
+    if (already_running) {
+        fprintf(stderr, "%s: already running!\n", __func__);
+        return false;
+    }
+
+    SDL_PauseAudioDevice(m_dev_id_in, 0);
+    m_running = true;
+
+    return true;
+}
+// Pause the capture device and mark capture as stopped.
+// Returns false (after logging to stderr) if no device is open or capture is already paused.
+bool audio_async::pause() {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
+        return false;
+    }
+
+    const bool is_capturing = m_running;
+    if (!is_capturing) {
+        fprintf(stderr, "%s: already paused!\n", __func__);
+        return false;
+    }
+
+    SDL_PauseAudioDevice(m_dev_id_in, 1);
+    m_running = false;
+
+    return true;
+}
+// Discard everything currently held in the circular buffer.
+// Returns false (after logging to stderr) if no device is open or capture is not running.
+bool audio_async::clear() {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
+        return false;
+    }
+
+    const bool is_capturing = m_running;
+    if (!is_capturing) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return false;
+    }
+
+    // reset the write position and fill level under the same lock the callback takes
+    std::lock_guard<std::mutex> guard(m_mutex);
+    m_audio_len = 0;
+    m_audio_pos = 0;
+
+    return true;
+}
+// callback to be called by SDL
+//
+// Stores the samples delivered in `stream` (`len` bytes, interpreted as floats)
+// in the circular buffer, overwriting the oldest audio once the buffer is full.
+// Does nothing while capture is not running.
+void audio_async::callback(uint8_t * stream, int len) {
+    if (!m_running || len <= 0) {
+        return;
+    }
+
+    size_t n_samples = len / sizeof(float);
+
+    // keep a copy of the most recent chunk
+    m_audio_new.resize(n_samples);
+    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        const size_t n_capacity = m_audio.size();
+
+        if (n_capacity == 0 || n_samples == 0) {
+            // nothing to store - also avoids a modulo by zero below
+            return;
+        }
+
+        if (n_samples > n_capacity) {
+            // chunk is larger than the whole buffer - keep only its most recent part
+            stream   += (n_samples - n_capacity) * sizeof(float);
+            n_samples = n_capacity;
+        }
+
+        if (m_audio_pos + n_samples > n_capacity) {
+            // the chunk wraps around the end of the buffer: copy it in two parts
+            const size_t n0 = n_capacity - m_audio_pos;
+
+            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
+            // `stream` is a byte pointer, so the offset of the second part must be given in bytes
+            memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
+
+            m_audio_pos = (m_audio_pos + n_samples) % n_capacity;
+            m_audio_len = n_capacity;
+        } else {
+            memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
+
+            m_audio_pos = (m_audio_pos + n_samples) % n_capacity;
+            m_audio_len = std::min(m_audio_len + n_samples, n_capacity);
+        }
+    }
+}
+// get audio data from the circular buffer
+//
+// Copies the most recent `ms` milliseconds of captured audio into `result`, oldest sample first.
+// If ms <= 0 the full buffer length (m_len_ms) is requested. When less audio than requested
+// has been captured, `result` holds only what is available.
+// If no device is open or capture is not running, an error is logged and `result` is left untouched.
+void audio_async::get(int ms, std::vector<float> & result) {
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
+        return;
+    }
+
+    if (!m_running) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return;
+    }
+
+    result.clear();
+
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    if (ms <= 0) {
+        ms = m_len_ms;
+    }
+
+    // multiply in 64 bits: sample_rate * ms can exceed INT_MAX for long requests
+    size_t n_samples = (size_t) (((int64_t) m_sample_rate * ms) / 1000);
+    if (n_samples > m_audio_len) {
+        n_samples = m_audio_len;
+    }
+
+    result.resize(n_samples);
+
+    if (n_samples == 0) {
+        // nothing captured yet - do not index into possibly empty vectors
+        return;
+    }
+
+    // n_samples <= m_audio_len <= capacity and m_audio_pos < capacity, so this cannot underflow
+    const size_t n_capacity = m_audio.size();
+    const size_t s0 = (m_audio_pos + n_capacity - n_samples) % n_capacity;
+
+    if (s0 + n_samples > n_capacity) {
+        // requested range wraps around the end of the buffer: copy it in two parts
+        const size_t n0 = n_capacity - s0;
+
+        memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
+        memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
+    } else {
+        memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
+    }
+}
+// Drain the pending SDL events.
+// Returns false as soon as an SDL_QUIT event is seen, true once the queue is empty.
+bool sdl_poll_events() {
+    SDL_Event ev;
+    while (SDL_PollEvent(&ev)) {
+        if (ev.type == SDL_QUIT) {
+            return false;
+        }
+    }
+
+    return true;
+}

examples/common-sdl.h ADDED Viewed

	@@ -0,0 +1,50 @@

+#pragma once
+#include <SDL.h>
+#include <SDL_audio.h>
+#include <atomic>
+#include <cstdint>
+#include <vector>
+#include <mutex>
+//
+// SDL Audio capture: keeps the most recently captured audio in a circular buffer filled by an SDL callback
+//
+class audio_async {
+public:
+    audio_async(int len_ms); // len_ms: length of the circular buffer in milliseconds
+    ~audio_async(); // closes the capture device if one was opened
+    bool init(int capture_id, int sample_rate); // capture_id < 0 opens the default capture device; returns false on failure
+    // start capturing audio via the provided SDL callback
+    // keep last len_ms milliseconds of audio in a circular buffer
+    bool resume();
+    bool pause(); // stop capturing; returns false if no device is open or already paused
+    bool clear(); // drop all buffered audio; returns false if no device is open or not running
+    // callback to be called by SDL
+    void callback(uint8_t * stream, int len); // len is the size of stream in bytes
+    // get audio data from the circular buffer
+    void get(int ms, std::vector<float> & audio); // ms <= 0 requests the whole buffer length
+private:
+    SDL_AudioDeviceID m_dev_id_in = 0; // 0 means no capture device is open
+    int m_len_ms = 0; // buffer length in milliseconds
+    int m_sample_rate = 0; // sample rate obtained from SDL in init()
+    std::atomic_bool m_running; // set by resume(), cleared by pause()
+    std::mutex       m_mutex; // held while m_audio, m_audio_pos and m_audio_len are accessed
+    std::vector<float> m_audio; // circular sample buffer
+    std::vector<float> m_audio_new; // copy of the most recent chunk received by callback()
+    size_t             m_audio_pos = 0; // index in m_audio where the next sample is written
+    size_t             m_audio_len = 0; // number of valid samples currently in m_audio
+};
+// Return false if need to quit
+bool sdl_poll_events();

examples/common.cpp ADDED Viewed

	@@ -0,0 +1,162 @@

+#include "common.h"
+// third-party utilities
+// use your favorite implementations
+#define DR_WAV_IMPLEMENTATION
+#include "dr_wav.h"
+#include <cmath>
+#include <regex>
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+// Return a copy of s with leading and trailing whitespace removed.
+std::string trim(const std::string & s) {
+    // compile the pattern once instead of on every call - constructing a std::regex is expensive
+    static const std::regex e("^\\s+|\\s+$");
+    return std::regex_replace(s, e, "");
+}
+// Return a copy of s in which every occurrence of `from` is replaced by `to`.
+// Text inserted by a replacement is not scanned again.
+// An empty `from` matches nothing and s is returned unchanged.
+std::string replace(const std::string & s, const std::string & from, const std::string & to) {
+    std::string result = s;
+    if (from.empty()) {
+        // find("") succeeds at every position, which would make the loop below run forever
+        return result;
+    }
+    size_t pos = 0;
+    while ((pos = result.find(from, pos)) != std::string::npos) {
+        result.replace(pos, from.length(), to);
+        // continue after the inserted text so that `to` containing `from` cannot loop
+        pos += to.length();
+    }
+    return result;
+}
+// Read a 16-bit mono or stereo WAV file sampled at COMMON_SAMPLE_RATE.
+//
+//   fname   - path of the WAV file, or "-" to read the WAV data from stdin
+//   pcmf32  - receives the audio as mono float samples (stereo input is averaged)
+//   pcmf32s - when `stereo` is set, receives the two channels as separate float buffers
+//   stereo  - require 2-channel input and fill pcmf32s
+//
+// Returns false (after printing a message to stderr) if the input cannot be opened
+// or does not have the required format.
+bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
+    drwav wav;
+    std::vector<uint8_t> wav_data; // used for pipe input from stdin
+
+    if (fname == "-") {
+        {
+            uint8_t buf[1024];
+            while (true)
+            {
+                const size_t n = fread(buf, 1, sizeof(buf), stdin);
+                if (n == 0) {
+                    break;
+                }
+                wav_data.insert(wav_data.end(), buf, buf + n);
+            }
+        }
+
+        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
+            fprintf(stderr, "error: failed to open WAV file from stdin\n");
+            return false;
+        }
+
+        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
+    }
+    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
+        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
+        return false;
+    }
+
+    // from here on `wav` is initialized and must be released on every exit path
+    bool valid = true;
+
+    if (wav.channels != 1 && wav.channels != 2) {
+        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
+        valid = false;
+    } else if (stereo && wav.channels != 2) {
+        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
+        valid = false;
+    } else if (wav.sampleRate != COMMON_SAMPLE_RATE) {
+        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
+        valid = false;
+    } else if (wav.bitsPerSample != 16) {
+        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
+        valid = false;
+    }
+
+    if (!valid) {
+        drwav_uninit(&wav);
+        return false;
+    }
+
+    // remember the channel count so `wav` is not touched after drwav_uninit()
+    const uint64_t n_channels = wav.channels;
+
+    // for stdin the frame count is estimated from the byte count (this includes the header bytes),
+    // so it is only an upper bound on the number of frames actually present
+    const uint64_t n_max = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
+
+    std::vector<int16_t> pcm16;
+    pcm16.resize(n_max*n_channels);
+
+    // use the number of frames actually decoded instead of padding the output with zeros
+    const uint64_t n = drwav_read_pcm_frames_s16(&wav, n_max, pcm16.data());
+    drwav_uninit(&wav);
+
+    // convert to mono, float
+    pcmf32.resize(n);
+    if (n_channels == 1) {
+        for (uint64_t i = 0; i < n; i++) {
+            pcmf32[i] = float(pcm16[i])/32768.0f;
+        }
+    } else {
+        for (uint64_t i = 0; i < n; i++) {
+            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
+        }
+    }
+
+    if (stereo) {
+        // convert to stereo, float
+        pcmf32s.resize(2);
+
+        pcmf32s[0].resize(n);
+        pcmf32s[1].resize(n);
+        for (uint64_t i = 0; i < n; i++) {
+            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
+            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
+        }
+    }
+
+    return true;
+}
+// Apply a first-order high-pass filter to `data` in place.
+//
+//   data        - samples to filter; the first sample is left unchanged
+//   cutoff      - cutoff frequency in Hz
+//   sample_rate - sample rate of `data` in Hz
+//
+// An empty `data` is left untouched.
+void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
+    if (data.empty()) {
+        // data[0] below must not be read from an empty vector
+        return;
+    }
+
+    const float rc = 1.0f / (2.0f * M_PI * cutoff);
+    const float dt = 1.0f / sample_rate;
+    const float alpha = dt / (rc + dt);
+
+    float y = data[0];
+
+    for (size_t i = 1; i < data.size(); i++) {
+        // data[i - 1] has already been overwritten with the previous output at this point
+        y = alpha * (y + data[i] - data[i - 1]);
+        data[i] = y;
+    }
+}
+// Energy based voice activity check.
+// Compares the mean absolute amplitude of the trailing last_ms milliseconds with that of the whole buffer.
+// Returns false when the buffer is not longer than last_ms, or when the trailing energy
+// exceeds vad_thold times the overall energy; returns true otherwise.
+// When freq_thold > 0 the samples are high-pass filtered in place first.
+bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
+    const int n_total = pcmf32.size();
+    const int n_tail  = (sample_rate * last_ms) / 1000;
+
+    // not enough samples - assume no speech
+    if (n_tail >= n_total) {
+        return false;
+    }
+
+    if (freq_thold > 0.0f) {
+        high_pass_filter(pcmf32, freq_thold, sample_rate);
+    }
+
+    // index of the first sample that belongs to the trailing window
+    const int tail_begin = n_total - n_tail;
+
+    float sum_all  = 0.0f;
+    float sum_tail = 0.0f;
+
+    for (int i = 0; i < n_total; ++i) {
+        sum_all += fabsf(pcmf32[i]);
+        if (i >= tail_begin) {
+            sum_tail += fabsf(pcmf32[i]);
+        }
+    }
+
+    const float energy_all  = sum_all  / n_total;
+    const float energy_last = sum_tail / n_tail;
+
+    if (verbose) {
+        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
+    }
+
+    return !(energy_last > vad_thold*energy_all);
+}

examples/common.h ADDED Viewed

	@@ -0,0 +1,40 @@

+#pragma once
+// needs to match WHISPER_SAMPLE_RATE
+#define COMMON_SAMPLE_RATE 16000
+#include <vector>
+#include <string>
+std::string trim(const std::string & s); // returns s with leading and trailing whitespace removed
+std::string replace( // returns a copy of s with each occurrence of `from` replaced by `to`
+        const std::string & s,
+        const std::string & from,
+        const std::string & to);
+// Read WAV audio file and store the PCM data into pcmf32
+// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
+// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
+bool read_wav(
+        const std::string & fname,
+        std::vector<float> & pcmf32,
+        std::vector<std::vector<float>> & pcmf32s,
+        bool stereo);
+// Apply a high-pass frequency filter to PCM audio
+// Suppresses frequencies below cutoff Hz
+void high_pass_filter(
+        std::vector<float> & data,
+        float cutoff,
+        float sample_rate);
+// Basic voice activity detection (VAD) using audio energy adaptive threshold: returns true when the mean absolute amplitude of the last last_ms milliseconds does not exceed vad_thold times that of the whole buffer
+bool vad_simple(
+        std::vector<float> & pcmf32, // audio samples; high-pass filtered in place when freq_thold > 0
+        int   sample_rate, // samples per second, used to convert last_ms into a sample count
+        int   last_ms, // length of the trailing window in milliseconds; the result is false if pcmf32 is not longer than this
+        float vad_thold, // factor applied to the overall energy before comparing it with the trailing energy
+        float freq_thold, // cutoff passed to high_pass_filter; values <= 0 skip the filtering
+        bool  verbose); // when set, the computed energies are printed to stderr

examples/main/CMakeLists.txt CHANGED Viewed

@@ -3,4 +3,4 @@ add_executable(${TARGET} main.cpp)
 include(DefaultTargetOptions)
-target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})


3
4	include(DefaultTargetOptions)
5
6	+ target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})

examples/main/main.cpp CHANGED Viewed

@@ -1,9 +1,6 @@
-#include "whisper.h"
-// third-party utilities
-// use your favorite implementations
-#define DR_WAV_IMPLEMENTATION
-#include "dr_wav.h"
 #include <cmath>
 #include <fstream>
@@ -86,7 +83,7 @@ struct whisper_params {
     std::string model    = "models/ggml-base.en.bin";
     std::vector<std::string> fname_inp = {};
-    std::vector<std::string> fname_outp = {};
 };
 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -126,7 +123,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-osrt" || arg == "--output-srt")     { params.output_srt     = true; }
         else if (arg == "-owts" || arg == "--output-words")   { params.output_wts     = true; }
         else if (arg == "-ocsv" || arg == "--output-csv")     { params.output_csv     = true; }
-        else if (arg == "-of"   || arg == "--output-file")    { params.fname_outp.emplace_back(argv[++i]); }
         else if (arg == "-ps"   || arg == "--print-special")  { params.print_special  = true; }
         else if (arg == "-pc"   || arg == "--print-colors")   { params.print_colors   = true; }
         else if (arg == "-pp"   || arg == "--print-progress") { params.print_progress = true; }
@@ -520,91 +517,14 @@ int main(int argc, char ** argv) {
     for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
         const auto fname_inp = params.fname_inp[f];
-		const auto fname_outp = f < (int) params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
-        std::vector<float> pcmf32; // mono-channel F32 PCM
         std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-        // WAV input
-        {
-            drwav wav;
-            std::vector<uint8_t> wav_data; // used for pipe input from stdin
-            if (fname_inp == "-") {
-                {
-                    uint8_t buf[1024];
-                    while (true)
-                    {
-                        const size_t n = fread(buf, 1, sizeof(buf), stdin);
-                        if (n == 0) {
-                            break;
-                        }
-                        wav_data.insert(wav_data.end(), buf, buf + n);
-                    }
-                }
-                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
-                    fprintf(stderr, "error: failed to open WAV file from stdin\n");
-                    return 4;
-                }
-                fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
-            }
-            else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
-                fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
-                return 5;
-            }
-            if (wav.channels != 1 && wav.channels != 2) {
-                fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
-                return 6;
-            }
-            if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
-                fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", argv[0], fname_inp.c_str());
-                return 6;
-            }
-            if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
-                fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", argv[0], fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
-                return 8;
-            }
-            if (wav.bitsPerSample != 16) {
-                fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
-                return 9;
-            }
-            const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
-            std::vector<int16_t> pcm16;
-            pcm16.resize(n*wav.channels);
-            drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
-            drwav_uninit(&wav);
-            // convert to mono, float
-            pcmf32.resize(n);
-            if (wav.channels == 1) {
-                for (uint64_t i = 0; i < n; i++) {
-                    pcmf32[i] = float(pcm16[i])/32768.0f;
-                }
-            } else {
-                for (uint64_t i = 0; i < n; i++) {
-                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
-                }
-            }
-            if (params.diarize) {
-                // convert to stereo, float
-                pcmf32s.resize(2);
-                pcmf32s[0].resize(n);
-                pcmf32s[1].resize(n);
-                for (uint64_t i = 0; i < n; i++) {
-                    pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
-                    pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
-                }
-            }
         }
         // print system information
@@ -701,34 +621,33 @@ int main(int argc, char ** argv) {
             // output to text file
             if (params.output_txt) {
-                const auto fname_txt = fname_outp + ".txt";
                 output_txt(ctx, fname_txt.c_str());
             }
             // output to VTT file
             if (params.output_vtt) {
-                const auto fname_vtt = fname_outp + ".vtt";
                 output_vtt(ctx, fname_vtt.c_str());
             }
             // output to SRT file
             if (params.output_srt) {
-                const auto fname_srt = fname_outp + ".srt";
                 output_srt(ctx, fname_srt.c_str(), params);
             }
             // output to WTS file
             if (params.output_wts) {
-                const auto fname_wts = fname_outp + ".wts";
                 output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
             }
-	    // output to CSV file
             if (params.output_csv) {
-                const auto fname_csv = fname_outp + ".csv";
                 output_csv(ctx, fname_csv.c_str());
             }
         }
     }

+#include "common.h"
+#include "whisper.h"
 #include <cmath>
 #include <fstream>
     std::string model    = "models/ggml-base.en.bin";
     std::vector<std::string> fname_inp = {};
+    std::vector<std::string> fname_out = {};
 };
 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
         else if (arg == "-osrt" || arg == "--output-srt")     { params.output_srt     = true; }
         else if (arg == "-owts" || arg == "--output-words")   { params.output_wts     = true; }
         else if (arg == "-ocsv" || arg == "--output-csv")     { params.output_csv     = true; }
+        else if (arg == "-of"   || arg == "--output-file")    { params.fname_out.emplace_back(argv[++i]); }
         else if (arg == "-ps"   || arg == "--print-special")  { params.print_special  = true; }
         else if (arg == "-pc"   || arg == "--print-colors")   { params.print_colors   = true; }
         else if (arg == "-pp"   || arg == "--print-progress") { params.print_progress = true; }
     for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
         const auto fname_inp = params.fname_inp[f];
+		const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
+        std::vector<float> pcmf32;               // mono-channel F32 PCM
         std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+        if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
+            fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
+            continue;
         }
         // print system information
             // output to text file
             if (params.output_txt) {
+                const auto fname_txt = fname_out + ".txt";
                 output_txt(ctx, fname_txt.c_str());
             }
             // output to VTT file
             if (params.output_vtt) {
+                const auto fname_vtt = fname_out + ".vtt";
                 output_vtt(ctx, fname_vtt.c_str());
             }
             // output to SRT file
             if (params.output_srt) {
+                const auto fname_srt = fname_out + ".srt";
                 output_srt(ctx, fname_srt.c_str(), params);
             }
             // output to WTS file
             if (params.output_wts) {
+                const auto fname_wts = fname_out + ".wts";
                 output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
             }
+            // output to CSV file
             if (params.output_csv) {
+                const auto fname_csv = fname_out + ".csv";
                 output_csv(ctx, fname_csv.c_str());
             }
         }
     }

examples/stream/CMakeLists.txt CHANGED Viewed

@@ -5,6 +5,5 @@ if (WHISPER_SUPPORT_SDL2)
     include(DefaultTargetOptions)
-    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
-    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()

     include(DefaultTargetOptions)
+    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
 endif ()

examples/stream/stream.cpp CHANGED Viewed

@@ -3,19 +3,16 @@
 // A very quick-n-dirty implementation serving mainly as a proof of concept.
 //
 #include "whisper.h"
-#include <SDL.h>
-#include <SDL_audio.h>
-#include <atomic>
 #include <cassert>
 #include <cstdio>
 #include <string>
 #include <thread>
 #include <vector>
 #include <fstream>
-#include <mutex>
 //  500 -> 00:05.000
 // 6000 -> 01:00.000
@@ -116,306 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "\n");
 }
-//
-// SDL Audio capture
-//
-class audio_async {
-public:
-    audio_async(int len_ms);
-    ~audio_async();
-    bool init(int capture_id, int sample_rate);
-    // start capturing audio via the provided SDL callback
-    // keep last len_ms seconds of audio in a circular buffer
-    bool resume();
-    bool pause();
-    bool clear();
-    // callback to be called by SDL
-    void callback(uint8_t * stream, int len);
-    // get audio data from the circular buffer
-    void get(int ms, std::vector<float> & audio);
-private:
-    SDL_AudioDeviceID m_dev_id_in = 0;
-    int m_len_ms = 0;
-    int m_sample_rate = 0;
-    std::atomic_bool m_running;
-    std::mutex       m_mutex;
-    std::vector<float> m_audio;
-    std::vector<float> m_audio_new;
-    size_t             m_audio_pos = 0;
-    size_t             m_audio_len = 0;
-};
-audio_async::audio_async(int len_ms) {
-    m_len_ms = len_ms;
-    m_running = false;
-}
-audio_async::~audio_async() {
-    if (m_dev_id_in) {
-        SDL_CloseAudioDevice(m_dev_id_in);
-    }
-}
-bool audio_async::init(int capture_id, int sample_rate) {
-    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
-    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
-        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
-        return false;
-    }
-    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
-    {
-        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
-        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
-        for (int i = 0; i < nDevices; i++) {
-            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
-        }
-    }
-    SDL_AudioSpec capture_spec_requested;
-    SDL_AudioSpec capture_spec_obtained;
-    SDL_zero(capture_spec_requested);
-    SDL_zero(capture_spec_obtained);
-    capture_spec_requested.freq     = sample_rate;
-    capture_spec_requested.format   = AUDIO_F32;
-    capture_spec_requested.channels = 1;
-    capture_spec_requested.samples  = 1024;
-    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
-        audio_async * audio = (audio_async *) userdata;
-        audio->callback(stream, len);
-    };
-    capture_spec_requested.userdata = this;
-    if (capture_id >= 0) {
-        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
-        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-    } else {
-        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
-        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-    }
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
-        m_dev_id_in = 0;
-        return false;
-    } else {
-        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
-        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
-        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
-                capture_spec_requested.format);
-        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
-                capture_spec_requested.channels);
-        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
-    }
-    m_sample_rate = capture_spec_obtained.freq;
-    m_audio.resize((m_sample_rate*m_len_ms)/1000);
-    return true;
-}
-bool audio_async::resume() {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
-        return false;
-    }
-    if (m_running) {
-        fprintf(stderr, "%s: already running!\n", __func__);
-        return false;
-    }
-    SDL_PauseAudioDevice(m_dev_id_in, 0);
-    m_running = true;
-    return true;
-}
-bool audio_async::pause() {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
-        return false;
-    }
-    if (!m_running) {
-        fprintf(stderr, "%s: already paused!\n", __func__);
-        return false;
-    }
-    SDL_PauseAudioDevice(m_dev_id_in, 1);
-    m_running = false;
-    return true;
-}
-bool audio_async::clear() {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
-        return false;
-    }
-    if (!m_running) {
-        fprintf(stderr, "%s: not running!\n", __func__);
-        return false;
-    }
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-        m_audio_pos = 0;
-        m_audio_len = 0;
-    }
-    return true;
-}
-// callback to be called by SDL
-void audio_async::callback(uint8_t * stream, int len) {
-    if (!m_running) {
-        return;
-    }
-    const size_t n_samples = len / sizeof(float);
-    m_audio_new.resize(n_samples);
-    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
-    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-        if (m_audio_pos + n_samples > m_audio.size()) {
-            const size_t n0 = m_audio.size() - m_audio_pos;
-            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
-            memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
-            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
-            m_audio_len = m_audio.size();
-        } else {
-            memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
-            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
-            m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
-        }
-    }
-}
-void audio_async::get(int ms, std::vector<float> & result) {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
-        return;
-    }
-    if (!m_running) {
-        fprintf(stderr, "%s: not running!\n", __func__);
-        return;
-    }
-    result.clear();
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-        if (ms <= 0) {
-            ms = m_len_ms;
-        }
-        size_t n_samples = (m_sample_rate * ms) / 1000;
-        if (n_samples > m_audio_len) {
-            n_samples = m_audio_len;
-        }
-        result.resize(n_samples);
-        int s0 = m_audio_pos - n_samples;
-        if (s0 < 0) {
-            s0 += m_audio.size();
-        }
-        if (s0 + n_samples > m_audio.size()) {
-            const size_t n0 = m_audio.size() - s0;
-            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
-            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
-        } else {
-            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
-        }
-    }
-}
-///////////////////////////
-void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
-    const float rc = 1.0f / (2.0f * M_PI * cutoff);
-    const float dt = 1.0f / sample_rate;
-    const float alpha = dt / (rc + dt);
-    float y = data[0];
-    for (size_t i = 1; i < data.size(); i++) {
-        y = alpha * (y + data[i] - data[i - 1]);
-        data[i] = y;
-    }
-}
-bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
-    const int n_samples      = pcmf32.size();
-    const int n_samples_last = (sample_rate * last_ms) / 1000;
-    if (n_samples_last >= n_samples) {
-        // not enough samples - assume no speech
-        return false;
-    }
-    if (freq_thold > 0.0f) {
-        high_pass_filter(pcmf32, freq_thold, sample_rate);
-    }
-    float energy_all  = 0.0f;
-    float energy_last = 0.0f;
-    for (int i = 0; i < n_samples; i++) {
-        energy_all += fabsf(pcmf32[i]);
-        if (i >= n_samples - n_samples_last) {
-            energy_last += fabsf(pcmf32[i]);
-        }
-    }
-    energy_all  /= n_samples;
-    energy_last /= n_samples_last;
-    if (verbose) {
-        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
-    }
-    if (energy_last > vad_thold*energy_all) {
-        return false;
-    }
-    return true;
-}
 int main(int argc, char ** argv) {
     whisper_params params;
@@ -426,10 +123,10 @@ int main(int argc, char ** argv) {
     params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
     params.length_ms = std::max(params.length_ms, params.step_ms);
-    const int n_samples_step = (params.step_ms  *1e-3)*WHISPER_SAMPLE_RATE;
-    const int n_samples_len  = (params.length_ms*1e-3)*WHISPER_SAMPLE_RATE;
-    const int n_samples_keep = (params.keep_ms  *1e-3)*WHISPER_SAMPLE_RATE;
-    const int n_samples_30s  = (30000           *1e-3)*WHISPER_SAMPLE_RATE;
     const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
@@ -517,23 +214,7 @@ int main(int argc, char ** argv) {
     // main audio loop
     while (is_running) {
         // handle Ctrl + C
-        {
-            SDL_Event event;
-            while (SDL_PollEvent(&event)) {
-                switch (event.type) {
-                    case SDL_QUIT:
-                        {
-                            is_running = false;
-                        } break;
-                    default:
-                        break;
-                }
-            }
-            if (!is_running) {
-                break;
-            }
-        }
         if (!is_running) {
             break;
@@ -556,7 +237,7 @@ int main(int argc, char ** argv) {
                     break;
                 }
-                SDL_Delay(1);
             }
             const int n_samples_new = pcmf32_new.size();
@@ -587,7 +268,7 @@ int main(int argc, char ** argv) {
             audio.get(2000, pcmf32_new);
-            if (vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
                 audio.get(params.length_ms, pcmf32);
             } else {
                 std::this_thread::sleep_for(std::chrono::milliseconds(100));

 // A very quick-n-dirty implementation serving mainly as a proof of concept.
 //
+#include "common.h"
+#include "common-sdl.h"
 #include "whisper.h"
 #include <cassert>
 #include <cstdio>
 #include <string>
 #include <thread>
 #include <vector>
 #include <fstream>
 //  500 -> 00:05.000
 // 6000 -> 01:00.000
     fprintf(stderr, "\n");
 }
 int main(int argc, char ** argv) {
     whisper_params params;
     params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
     params.length_ms = std::max(params.length_ms, params.step_ms);
+    const int n_samples_step = (1e-3*params.step_ms  )*WHISPER_SAMPLE_RATE;
+    const int n_samples_len  = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
+    const int n_samples_keep = (1e-3*params.keep_ms  )*WHISPER_SAMPLE_RATE;
+    const int n_samples_30s  = (1e-3*30000.0         )*WHISPER_SAMPLE_RATE;
     const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
     // main audio loop
     while (is_running) {
         // handle Ctrl + C
+        is_running = sdl_poll_events();
         if (!is_running) {
             break;
                     break;
                 }
+                std::this_thread::sleep_for(std::chrono::milliseconds(1));
             }
             const int n_samples_new = pcmf32_new.size();
             audio.get(2000, pcmf32_new);
+            if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
                 audio.get(params.length_ms, pcmf32);
             } else {
                 std::this_thread::sleep_for(std::chrono::milliseconds(100));

examples/talk/CMakeLists.txt CHANGED Viewed

@@ -7,7 +7,7 @@ if (WHISPER_SUPPORT_SDL2)
     # TODO: this is temporary
     #       need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET} talk.cpp gpt-2.cpp ../../ggml.c ../../whisper.cpp)
     include(DefaultTargetOptions)

     # TODO: this is temporary
     #       need to export ggml symbols for MSVC, but too lazy ..
+    add_executable(${TARGET} talk.cpp gpt-2.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
     include(DefaultTargetOptions)

examples/talk/talk.cpp CHANGED Viewed

@@ -1,16 +1,14 @@
 // Talk with AI
 //
 #include "whisper.h"
 #include "gpt-2.h"
-#include <SDL.h>
-#include <SDL_audio.h>
 #include <cassert>
 #include <cstdio>
 #include <fstream>
-#include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
@@ -105,320 +103,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "\n");
 }
-//
-// SDL Audio capture
-//
-// Captures audio from an SDL capture device into a fixed-size circular buffer.
-class audio_async {
-public:
-    // len_ms: length of the circular buffer, in milliseconds
-    audio_async(int len_ms);
-    ~audio_async();
-    // open a capture device; a negative capture_id selects the default device
-    bool init(int capture_id, int sample_rate);
-    // start capturing audio via the provided SDL callback
-    // keep the last len_ms milliseconds of audio in a circular buffer
-    bool resume();
-    bool pause();
-    bool clear();
-    // callback to be called by SDL
-    void callback(uint8_t * stream, int len);
-    // get audio data from the circular buffer
-    void get(int ms, std::vector<float> & audio);
-private:
-    SDL_AudioDeviceID m_dev_id_in{0};
-    int m_len_ms{0};
-    int m_sample_rate{0};
-    bool       m_running{false};
-    std::mutex m_mutex;
-    std::vector<float> m_audio;      // circular buffer storage
-    std::vector<float> m_audio_new;  // staging copy of the latest callback chunk
-    size_t             m_audio_pos{0};  // next write index into m_audio
-    size_t             m_audio_len{0};  // number of valid samples in m_audio
-};
-// Stores the requested circular-buffer length (milliseconds); no SDL calls are made here.
-audio_async::audio_async(int len_ms) : m_len_ms(len_ms) {
-}
-// Closes the capture device if one was opened.
-audio_async::~audio_async() {
-    if (!m_dev_id_in) {
-        return;
-    }
-    SDL_CloseAudioDevice(m_dev_id_in);
-}
-// Initializes SDL audio, lists the capture devices on stderr and opens one.
-//   capture_id  - index of the capture device to open; negative opens the default device
-//   sample_rate - requested sample rate; the rate actually obtained sizes the buffer
-// Returns false when SDL cannot be initialized, no capture device can be opened,
-// or the resulting circular buffer would hold no samples.
-bool audio_async::init(int capture_id, int sample_rate) {
-    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
-    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
-        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
-        return false;
-    }
-    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
-    {
-        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
-        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
-        for (int i = 0; i < nDevices; i++) {
-            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
-        }
-    }
-    SDL_AudioSpec capture_spec_requested;
-    SDL_AudioSpec capture_spec_obtained;
-    SDL_zero(capture_spec_requested);
-    SDL_zero(capture_spec_obtained);
-    capture_spec_requested.freq     = sample_rate;
-    capture_spec_requested.format   = AUDIO_F32;
-    capture_spec_requested.channels = 1;
-    capture_spec_requested.samples  = 1024;
-    // SDL hands the callback raw bytes; forward them to the owning instance
-    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
-        audio_async * audio = (audio_async *) userdata;
-        audio->callback(stream, len);
-    };
-    capture_spec_requested.userdata = this;
-    if (capture_id >= 0) {
-        // the name lookup can return NULL (e.g. index out of range); never pass NULL to %s
-        const char * dev_name = SDL_GetAudioDeviceName(capture_id, SDL_TRUE);
-        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, dev_name ? dev_name : "(unknown)");
-        m_dev_id_in = SDL_OpenAudioDevice(dev_name, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-    } else {
-        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
-        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-    }
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
-        return false;
-    }
-    fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
-    fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
-    fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
-            capture_spec_requested.format);
-    fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
-            capture_spec_requested.channels);
-    fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
-    fprintf(stderr, "\n");
-    m_sample_rate = capture_spec_obtained.freq;
-    // 64-bit product: sample_rate * len_ms overflows int for long buffers
-    // (e.g. beyond ~134 s at 16 kHz)
-    const long long n_buffer = ((long long) m_sample_rate * m_len_ms) / 1000;
-    if (n_buffer <= 0) {
-        // an empty ring buffer cannot be written to (its size is used as a modulus)
-        fprintf(stderr, "%s: invalid audio buffer length: %d ms\n", __func__, m_len_ms);
-        return false;
-    }
-    m_audio.resize((size_t) n_buffer);
-    return true;
-}
-// Starts capture on the opened device (SDL_PauseAudioDevice with pause_on = 0).
-// Returns false, with a message on stderr, if no device is open or capture is
-// already running.
-bool audio_async::resume() {
-    bool ok = false;
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
-    } else if (m_running) {
-        fprintf(stderr, "%s: already running!\n", __func__);
-    } else {
-        SDL_PauseAudioDevice(m_dev_id_in, 0);
-        m_running = true;
-        ok = true;
-    }
-    return ok;
-}
-// Pauses capture on the opened device (SDL_PauseAudioDevice with pause_on = 1).
-// Returns false, with a message on stderr, if no device is open or capture is
-// not currently running.
-bool audio_async::pause() {
-    bool ok = false;
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
-    } else if (!m_running) {
-        fprintf(stderr, "%s: already paused!\n", __func__);
-    } else {
-        SDL_PauseAudioDevice(m_dev_id_in, 1);
-        m_running = false;
-        ok = true;
-    }
-    return ok;
-}
-// Discards buffered audio by resetting the write position and the valid length.
-// The stored samples are not zeroed. Returns false, with a message on stderr,
-// if no device is open or capture is not running.
-bool audio_async::clear() {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
-        return false;
-    }
-    if (!m_running) {
-        fprintf(stderr, "%s: not running!\n", __func__);
-        return false;
-    }
-    // m_mutex also guards these fields in callback() and get()
-    std::lock_guard<std::mutex> guard(m_mutex);
-    m_audio_len = 0;
-    m_audio_pos = 0;
-    return true;
-}
-// callback to be called by SDL
-// Appends one chunk of captured audio to the circular buffer.
-//   stream - raw bytes delivered by SDL, interpreted as 32-bit float samples
-//   len    - size of `stream` in bytes
-// Does nothing while capture is not running. If the chunk holds more samples
-// than the buffer, only the newest samples that fit are kept.
-void audio_async::callback(uint8_t * stream, int len) {
-    if (!m_running) {
-        return;
-    }
-    if (len <= 0) {
-        return;
-    }
-    size_t n_samples = len / sizeof(float);
-    // stage the bytes as floats so all offsets below are counted in samples
-    m_audio_new.resize(n_samples);
-    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
-    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-        const size_t capacity = m_audio.size();
-        if (capacity == 0) {
-            // nothing to write into; also avoids a modulo by zero below
-            return;
-        }
-        const float * src = m_audio_new.data();
-        if (n_samples > capacity) {
-            // chunk larger than the ring: drop its oldest samples
-            src      += n_samples - capacity;
-            n_samples = capacity;
-        }
-        if (m_audio_pos + n_samples > capacity) {
-            // wraps around: fill up to the end, then continue from the start.
-            // The second copy must start n0 *samples* into the chunk
-            // (indexing the byte pointer with n0 would skip only n0 bytes).
-            const size_t n0 = capacity - m_audio_pos;
-            memcpy(&m_audio[m_audio_pos], src, n0 * sizeof(float));
-            memcpy(&m_audio[0], src + n0, (n_samples - n0) * sizeof(float));
-            m_audio_len = capacity;
-        } else {
-            memcpy(&m_audio[m_audio_pos], src, n_samples * sizeof(float));
-            m_audio_len = std::min(m_audio_len + n_samples, capacity);
-        }
-        m_audio_pos = (m_audio_pos + n_samples) % capacity;
-    }
-}
-// Copies the most recent `ms` milliseconds of captured audio into `result`.
-//   ms     - window length in milliseconds; values <= 0 select the full buffer length
-//   result - cleared, then filled with at most the number of samples captured so far
-// Prints a message and leaves `result` untouched when no device is open or
-// capture is not running.
-void audio_async::get(int ms, std::vector<float> & result) {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
-        return;
-    }
-    if (!m_running) {
-        fprintf(stderr, "%s: not running!\n", __func__);
-        return;
-    }
-    result.clear();
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-        if (ms <= 0) {
-            ms = m_len_ms;
-        }
-        // 64-bit product: sample_rate * ms overflows int for long windows
-        // (e.g. beyond ~134 s at 16 kHz)
-        size_t n_samples = (size_t) (((long long) m_sample_rate * ms) / 1000);
-        if (n_samples > m_audio_len) {
-            n_samples = m_audio_len;
-        }
-        if (n_samples == 0) {
-            // nothing captured yet; result is already empty
-            return;
-        }
-        result.resize(n_samples);
-        // n_samples <= m_audio_len <= capacity, so the unsigned arithmetic cannot underflow
-        const size_t capacity = m_audio.size();
-        const size_t s0 = (m_audio_pos + capacity - n_samples) % capacity;
-        // contiguous part up to the end of the ring, then the wrapped remainder
-        const size_t n0 = std::min(n_samples, capacity - s0);
-        memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
-        if (n0 < n_samples) {
-            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
-        }
-    }
-}
-///////////////////////////
-// Returns a copy of `s` with leading and trailing whitespace removed.
-std::string trim(const std::string & s) {
-    return std::regex_replace(s, std::regex("^\\s+|\\s+$"), "");
-}
-// Returns a copy of `s` with every non-overlapping occurrence of `from`
-// replaced by `to`, scanning left to right. Text inserted by a replacement is
-// never rescanned. An empty `from` returns `s` unchanged: find("") matches at
-// every position, so the loop below would otherwise never terminate.
-std::string replace(const std::string & s, const std::string & from, const std::string & to) {
-    if (from.empty()) {
-        return s;
-    }
-    std::string result = s;
-    size_t pos = 0;
-    while ((pos = result.find(from, pos)) != std::string::npos) {
-        result.replace(pos, from.length(), to);
-        // skip past the inserted text so it is not matched again
-        pos += to.length();
-    }
-    return result;
-}
-// Filters `data` in place with a first-order RC-style recurrence.
-//   data        - samples to filter; data[0] is left untouched
-//   cutoff      - cutoff frequency, in the same unit as sample_rate
-//   sample_rate - sampling rate of `data`
-// An empty buffer is a no-op; without the guard data[0] would be read out of bounds.
-// NOTE(review): data[i - 1] already holds the previous *output* when it is read,
-// so the recurrence effectively reduces to alpha * data[i]. Left unchanged here
-// to avoid altering results for existing callers - confirm whether this is intended.
-void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
-    if (data.empty()) {
-        return;
-    }
-    const float rc = 1.0f / (2.0f * M_PI * cutoff);
-    const float dt = 1.0f / sample_rate;
-    const float alpha = dt / (rc + dt);
-    float y = data[0];
-    for (size_t i = 1; i < data.size(); i++) {
-        y = alpha * (y + data[i] - data[i - 1]);
-        data[i] = y;
-    }
-}
-// Energy-based check on the tail of `pcmf32`.
-// Compares the mean absolute amplitude of the last `last_ms` milliseconds with
-// the mean over the whole buffer and returns true when the tail is NOT louder
-// than vad_thold times the overall mean. Returns false when the buffer holds
-// no more samples than the tail window.
-// When freq_thold > 0 the buffer is first run through high_pass_filter, so
-// `pcmf32` is modified in place. `verbose` prints the energies to stderr.
-bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
-    const int n_total = pcmf32.size();
-    const int n_tail  = (sample_rate * last_ms) / 1000;
-    // not enough samples - assume no speech
-    if (n_tail >= n_total) {
-        return false;
-    }
-    if (freq_thold > 0.0f) {
-        high_pass_filter(pcmf32, freq_thold, sample_rate);
-    }
-    // mean absolute amplitude over the whole buffer
-    float energy_all = 0.0f;
-    for (int i = 0; i < n_total; i++) {
-        energy_all += fabsf(pcmf32[i]);
-    }
-    // mean absolute amplitude over the tail window only
-    float energy_last = 0.0f;
-    for (int i = n_total - n_tail; i < n_total; i++) {
-        energy_last += fabsf(pcmf32[i]);
-    }
-    energy_all  /= n_total;
-    energy_last /= n_tail;
-    if (verbose) {
-        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
-    }
-    // negated '>' on purpose: a NaN energy compares false and therefore yields true
-    return !(energy_last > vad_thold*energy_all);
-}
 std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
     const auto t_start = std::chrono::high_resolution_clock::now();
@@ -557,22 +241,10 @@ int main(int argc, char ** argv) {
     // main loop
     while (is_running) {
         // handle Ctrl + C
-        {
-            SDL_Event event;
-            while (SDL_PollEvent(&event)) {
-                switch (event.type) {
-                    case SDL_QUIT:
-                        {
-                            is_running = false;
-                        } break;
-                    default:
-                        break;
-                }
-            }
-            if (!is_running) {
-                break;
-            }
         }
         // delay
@@ -583,7 +255,7 @@ int main(int argc, char ** argv) {
         {
             audio.get(2000, pcmf32_cur);
-            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
                 fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                 audio.get(params.voice_ms, pcmf32_cur);

 // Talk with AI
 //
+#include "common.h"
+#include "common-sdl.h"
 #include "whisper.h"
 #include "gpt-2.h"
 #include <cassert>
 #include <cstdio>
 #include <fstream>
 #include <regex>
 #include <string>
 #include <thread>
     fprintf(stderr, "\n");
 }
 std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
     const auto t_start = std::chrono::high_resolution_clock::now();
     // main loop
     while (is_running) {
         // handle Ctrl + C
+        is_running = sdl_poll_events();
+        if (!is_running) {
+            break;
         }
         // delay
         {
             audio.get(2000, pcmf32_cur);
+            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
                 fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                 audio.get(params.voice_ms, pcmf32_cur);