whisper.cpp

Running

App Files Files Community

ggerganov committed on Dec 19, 2022

Commit

7ffa531

unverified ·

1 Parent(s): 061b71e

cmake : enable and fix -Wall -Wextra -Wpedantic C++ warnings

Browse files

Files changed (8) hide show

CMakeLists.txt +6 -0
examples/bench/bench.cpp +1 -1
examples/command/command.cpp +6 -6
examples/main/main.cpp +3 -6
examples/stream/stream.cpp +2 -2
examples/talk/gpt-2.cpp +3 -3
examples/talk/talk.cpp +2 -3
whisper.cpp +1 -8

CMakeLists.txt CHANGED Viewed

@@ -132,6 +132,12 @@ if (WHISPER_ALL_WARNINGS)
             -Wstrict-prototypes             \
             -Wpointer-arith                 \
         ")
     else()
         # todo : msvc
     endif()

             -Wstrict-prototypes             \
             -Wpointer-arith                 \
         ")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
+            -Wall                           \
+            -Wextra                         \
+            -Wpedantic                      \
+            -Wcast-qual                     \
+        ")
     else()
         # todo : msvc
     endif()

examples/bench/bench.cpp CHANGED Viewed

@@ -33,7 +33,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
     return true;
 }
-void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
     fprintf(stderr, "\n");
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");

     return true;
 }
+void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
     fprintf(stderr, "\n");
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");

examples/command/command.cpp CHANGED Viewed

@@ -81,7 +81,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
     return true;
 }
-void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
     fprintf(stderr, "\n");
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
@@ -387,7 +387,7 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
     float energy_all  = 0.0f;
     float energy_last = 0.0f;
-    for (size_t i = 0; i < n_samples; i++) {
         energy_all += fabsf(pcmf32[i]);
         if (i >= n_samples - n_samples_last) {
@@ -594,7 +594,7 @@ int main(int argc, char ** argv) {
             whisper_token tokens[1024];
             allowed_tokens.emplace_back();
-            for (int l = 0; l < cmd.size(); ++l) {
                 // NOTE: very important to add the whitespace !
                 //       the reason is that the first decoded token starts with a whitespace too!
                 std::string ss = std::string(" ") + cmd.substr(0, l + 1);
@@ -843,15 +843,15 @@ int main(int argc, char ** argv) {
                 // best command
                 {
                     fprintf(stdout, "\n");
                     fprintf(stdout, "%s: detected command: %s%s%s | p = %f | t = %d ms\n", __func__,
                             "\033[1m", allowed_commands[probs_id[0].second].c_str(), "\033[0m", probs_id[0].first,
-                            (int) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t_start).count());
                     fprintf(stdout, "\n");
                 }
-                const auto t_end = std::chrono::high_resolution_clock::now();
                 audio.clear();
             }
         }

     return true;
 }
+void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
     fprintf(stderr, "\n");
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     float energy_all  = 0.0f;
     float energy_last = 0.0f;
+    for (int i = 0; i < n_samples; i++) {
         energy_all += fabsf(pcmf32[i]);
         if (i >= n_samples - n_samples_last) {
             whisper_token tokens[1024];
             allowed_tokens.emplace_back();
+            for (int l = 0; l < (int) cmd.size(); ++l) {
                 // NOTE: very important to add the whitespace !
                 //       the reason is that the first decoded token starts with a whitespace too!
                 std::string ss = std::string(" ") + cmd.substr(0, l + 1);
                 // best command
                 {
+                    const auto t_end = std::chrono::high_resolution_clock::now();
                     fprintf(stdout, "\n");
                     fprintf(stdout, "%s: detected command: %s%s%s | p = %f | t = %d ms\n", __func__,
                             "\033[1m", allowed_commands[probs_id[0].second].c_str(), "\033[0m", probs_id[0].first,
+                            (int) std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count());
                     fprintf(stdout, "\n");
                 }
                 audio.clear();
             }
         }

examples/main/main.cpp CHANGED Viewed

@@ -129,7 +129,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
     return true;
 }
-void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
     fprintf(stderr, "\n");
     fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
     fprintf(stderr, "\n");
@@ -328,7 +328,7 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
 // karaoke video generation
 // outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
-bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
     std::ofstream fout(fname);
     fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
@@ -377,7 +377,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
             txt_ul = "\\ \\ ";
             {
-                int ncnt = 0;
                 for (int k = 0; k < n; ++k) {
                     const auto & token2 = tokens[k];
@@ -401,8 +400,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
                             txt_ul += "\\ ";
                         }
                     }
-                    ncnt += txt.size();
                 }
                 ::replace_all(txt_bg, "'", "\u2019");
@@ -637,7 +634,7 @@ int main(int argc, char ** argv) {
             {
                 static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-                wparams.encoder_begin_callback = [](struct whisper_context * ctx, void * user_data) {
                     bool is_aborted = *(bool*)user_data;
                     return !is_aborted;
                 };

     return true;
 }
+void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
     fprintf(stderr, "\n");
     fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
     fprintf(stderr, "\n");
 // karaoke video generation
 // outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
+bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & /*params*/, float t_sec) {
     std::ofstream fout(fname);
     fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
             txt_ul = "\\ \\ ";
             {
                 for (int k = 0; k < n; ++k) {
                     const auto & token2 = tokens[k];
                             txt_ul += "\\ ";
                         }
                     }
                 }
                 ::replace_all(txt_bg, "'", "\u2019");
             {
                 static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
                     bool is_aborted = *(bool*)user_data;
                     return !is_aborted;
                 };

examples/stream/stream.cpp CHANGED Viewed

@@ -90,7 +90,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
     return true;
 }
-void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
     fprintf(stderr, "\n");
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
@@ -391,7 +391,7 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
     float energy_all  = 0.0f;
     float energy_last = 0.0f;
-    for (size_t i = 0; i < n_samples; i++) {
         energy_all += fabsf(pcmf32[i]);
         if (i >= n_samples - n_samples_last) {

     return true;
 }
+void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
     fprintf(stderr, "\n");
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     float energy_all  = 0.0f;
     float energy_last = 0.0f;
+    for (int i = 0; i < n_samples; i++) {
         energy_all += fabsf(pcmf32[i]);
         if (i >= n_samples - n_samples_last) {

examples/talk/gpt-2.cpp CHANGED Viewed

@@ -78,7 +78,7 @@ gpt_vocab::id gpt_sample_top_k_top_p(
         const float * logits,
         int    top_k,
         double top_p,
-        double temp,
         std::mt19937 & rng) {
     int n_logits = vocab.id_to_token.size();
@@ -268,7 +268,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
             fin.read((char *) &len, sizeof(len));
             word.resize(len);
-            fin.read((char *) word.data(), len);
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
@@ -884,7 +884,7 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
     std::string result;
-    for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
         // predict
         if (embd.size() > 0) {
             if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {

         const float * logits,
         int    top_k,
         double top_p,
+        double /*temp*/,
         std::mt19937 & rng) {
     int n_logits = vocab.id_to_token.size();
             fin.read((char *) &len, sizeof(len));
             word.resize(len);
+            fin.read((char *) &word[0], len);
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
     std::string result;
+    for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
         // predict
         if (embd.size() > 0) {
             if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {

examples/talk/talk.cpp CHANGED Viewed

@@ -79,7 +79,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
     return true;
 }
-void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
     fprintf(stderr, "\n");
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
@@ -397,7 +397,7 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
     float energy_all  = 0.0f;
     float energy_last = 0.0f;
-    for (size_t i = 0; i < n_samples; i++) {
         energy_all += fabsf(pcmf32[i]);
         if (i >= n_samples - n_samples_last) {
@@ -541,7 +541,6 @@ int main(int argc, char ** argv) {
     bool force_speak = false;
     float prob0 = 0.0f;
-    float prob  = 0.0f;
     std::vector<float> pcmf32_cur;
     std::vector<float> pcmf32_prompt;

     return true;
 }
+void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
     fprintf(stderr, "\n");
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     float energy_all  = 0.0f;
     float energy_last = 0.0f;
+    for (int i = 0; i < n_samples; i++) {
         energy_all += fabsf(pcmf32[i]);
         if (i >= n_samples - n_samples_last) {
     bool force_speak = false;
     float prob0 = 0.0f;
     std::vector<float> pcmf32_cur;
     std::vector<float> pcmf32_prompt;

whisper.cpp CHANGED Viewed

@@ -621,7 +621,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
     const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
     size_t ctx_size = 0;
-    size_t ctx_mem_size = 0;
     {
         const auto & hparams = model.hparams;
@@ -730,12 +729,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
             ctx_size += n_text_layer*(             n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b
         }
-        ctx_mem_size += n_text_layer*n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_k
-        ctx_mem_size += n_text_layer*n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_v
-        ctx_mem_size += n_text_layer*n_audio_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_cross_k
-        ctx_mem_size += n_text_layer*n_audio_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_cross_v
         ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
         fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
@@ -2043,7 +2036,7 @@ static void fft(const std::vector<float> & in, std::vector<float> & out) {
 static bool log_mel_spectrogram(
     const float * samples,
     const int n_samples,
-    const int sample_rate,
     const int fft_size,
     const int fft_step,
     const int n_mel,

     const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
     size_t ctx_size = 0;
     {
         const auto & hparams = model.hparams;
             ctx_size += n_text_layer*(             n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b
         }
         ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
         fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
 static bool log_mel_spectrogram(
     const float * samples,
     const int n_samples,
+    const int /*sample_rate*/,
     const int fft_size,
     const int fft_step,
     const int n_mel,