Spaces:
Running
Running
cmake : enable and fix -Wall -Wextra -Wpedantic C++ warnings
Browse files
- CMakeLists.txt +6 -0
- examples/bench/bench.cpp +1 -1
- examples/command/command.cpp +6 -6
- examples/main/main.cpp +3 -6
- examples/stream/stream.cpp +2 -2
- examples/talk/gpt-2.cpp +3 -3
- examples/talk/talk.cpp +2 -3
- whisper.cpp +1 -8
CMakeLists.txt
CHANGED
|
@@ -132,6 +132,12 @@ if (WHISPER_ALL_WARNINGS)
|
|
| 132 |
-Wstrict-prototypes \
|
| 133 |
-Wpointer-arith \
|
| 134 |
")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
else()
|
| 136 |
# todo : msvc
|
| 137 |
endif()
|
|
|
|
| 132 |
-Wstrict-prototypes \
|
| 133 |
-Wpointer-arith \
|
| 134 |
")
|
| 135 |
+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
|
| 136 |
+
-Wall \
|
| 137 |
+
-Wextra \
|
| 138 |
+
-Wpedantic \
|
| 139 |
+
-Wcast-qual \
|
| 140 |
+
")
|
| 141 |
else()
|
| 142 |
# todo : msvc
|
| 143 |
endif()
|
examples/bench/bench.cpp
CHANGED
|
@@ -33,7 +33,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 33 |
return true;
|
| 34 |
}
|
| 35 |
|
| 36 |
-
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
| 37 |
fprintf(stderr, "\n");
|
| 38 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
| 39 |
fprintf(stderr, "\n");
|
|
|
|
| 33 |
return true;
|
| 34 |
}
|
| 35 |
|
| 36 |
+
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
|
| 37 |
fprintf(stderr, "\n");
|
| 38 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
| 39 |
fprintf(stderr, "\n");
|
examples/command/command.cpp
CHANGED
|
@@ -81,7 +81,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 81 |
return true;
|
| 82 |
}
|
| 83 |
|
| 84 |
-
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
| 85 |
fprintf(stderr, "\n");
|
| 86 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
| 87 |
fprintf(stderr, "\n");
|
|
@@ -387,7 +387,7 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
|
|
| 387 |
float energy_all = 0.0f;
|
| 388 |
float energy_last = 0.0f;
|
| 389 |
|
| 390 |
-
for (
|
| 391 |
energy_all += fabsf(pcmf32[i]);
|
| 392 |
|
| 393 |
if (i >= n_samples - n_samples_last) {
|
|
@@ -594,7 +594,7 @@ int main(int argc, char ** argv) {
|
|
| 594 |
whisper_token tokens[1024];
|
| 595 |
allowed_tokens.emplace_back();
|
| 596 |
|
| 597 |
-
for (int l = 0; l < cmd.size(); ++l) {
|
| 598 |
// NOTE: very important to add the whitespace !
|
| 599 |
// the reason is that the first decoded token starts with a whitespace too!
|
| 600 |
std::string ss = std::string(" ") + cmd.substr(0, l + 1);
|
|
@@ -843,15 +843,15 @@ int main(int argc, char ** argv) {
|
|
| 843 |
|
| 844 |
// best command
|
| 845 |
{
|
|
|
|
|
|
|
| 846 |
fprintf(stdout, "\n");
|
| 847 |
fprintf(stdout, "%s: detected command: %s%s%s | p = %f | t = %d ms\n", __func__,
|
| 848 |
"\033[1m", allowed_commands[probs_id[0].second].c_str(), "\033[0m", probs_id[0].first,
|
| 849 |
-
(int) std::chrono::duration_cast<std::chrono::milliseconds>(
|
| 850 |
fprintf(stdout, "\n");
|
| 851 |
}
|
| 852 |
|
| 853 |
-
const auto t_end = std::chrono::high_resolution_clock::now();
|
| 854 |
-
|
| 855 |
audio.clear();
|
| 856 |
}
|
| 857 |
}
|
|
|
|
| 81 |
return true;
|
| 82 |
}
|
| 83 |
|
| 84 |
+
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
|
| 85 |
fprintf(stderr, "\n");
|
| 86 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
| 87 |
fprintf(stderr, "\n");
|
|
|
|
| 387 |
float energy_all = 0.0f;
|
| 388 |
float energy_last = 0.0f;
|
| 389 |
|
| 390 |
+
for (int i = 0; i < n_samples; i++) {
|
| 391 |
energy_all += fabsf(pcmf32[i]);
|
| 392 |
|
| 393 |
if (i >= n_samples - n_samples_last) {
|
|
|
|
| 594 |
whisper_token tokens[1024];
|
| 595 |
allowed_tokens.emplace_back();
|
| 596 |
|
| 597 |
+
for (int l = 0; l < (int) cmd.size(); ++l) {
|
| 598 |
// NOTE: very important to add the whitespace !
|
| 599 |
// the reason is that the first decoded token starts with a whitespace too!
|
| 600 |
std::string ss = std::string(" ") + cmd.substr(0, l + 1);
|
|
|
|
| 843 |
|
| 844 |
// best command
|
| 845 |
{
|
| 846 |
+
const auto t_end = std::chrono::high_resolution_clock::now();
|
| 847 |
+
|
| 848 |
fprintf(stdout, "\n");
|
| 849 |
fprintf(stdout, "%s: detected command: %s%s%s | p = %f | t = %d ms\n", __func__,
|
| 850 |
"\033[1m", allowed_commands[probs_id[0].second].c_str(), "\033[0m", probs_id[0].first,
|
| 851 |
+
(int) std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count());
|
| 852 |
fprintf(stdout, "\n");
|
| 853 |
}
|
| 854 |
|
|
|
|
|
|
|
| 855 |
audio.clear();
|
| 856 |
}
|
| 857 |
}
|
examples/main/main.cpp
CHANGED
|
@@ -129,7 +129,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 129 |
return true;
|
| 130 |
}
|
| 131 |
|
| 132 |
-
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
| 133 |
fprintf(stderr, "\n");
|
| 134 |
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
| 135 |
fprintf(stderr, "\n");
|
|
@@ -328,7 +328,7 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
|
|
| 328 |
// karaoke video generation
|
| 329 |
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
|
| 330 |
// TODO: font parameter adjustments
|
| 331 |
-
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
|
| 332 |
std::ofstream fout(fname);
|
| 333 |
|
| 334 |
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
|
@@ -377,7 +377,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
|
| 377 |
txt_ul = "\\ \\ ";
|
| 378 |
|
| 379 |
{
|
| 380 |
-
int ncnt = 0;
|
| 381 |
for (int k = 0; k < n; ++k) {
|
| 382 |
const auto & token2 = tokens[k];
|
| 383 |
|
|
@@ -401,8 +400,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
|
| 401 |
txt_ul += "\\ ";
|
| 402 |
}
|
| 403 |
}
|
| 404 |
-
|
| 405 |
-
ncnt += txt.size();
|
| 406 |
}
|
| 407 |
|
| 408 |
::replace_all(txt_bg, "'", "\u2019");
|
|
@@ -637,7 +634,7 @@ int main(int argc, char ** argv) {
|
|
| 637 |
{
|
| 638 |
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
| 639 |
|
| 640 |
-
wparams.encoder_begin_callback = [](struct whisper_context * ctx, void * user_data) {
|
| 641 |
bool is_aborted = *(bool*)user_data;
|
| 642 |
return !is_aborted;
|
| 643 |
};
|
|
|
|
| 129 |
return true;
|
| 130 |
}
|
| 131 |
|
| 132 |
+
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
|
| 133 |
fprintf(stderr, "\n");
|
| 134 |
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
| 135 |
fprintf(stderr, "\n");
|
|
|
|
| 328 |
// karaoke video generation
|
| 329 |
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
|
| 330 |
// TODO: font parameter adjustments
|
| 331 |
+
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & /*params*/, float t_sec) {
|
| 332 |
std::ofstream fout(fname);
|
| 333 |
|
| 334 |
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
|
|
|
| 377 |
txt_ul = "\\ \\ ";
|
| 378 |
|
| 379 |
{
|
|
|
|
| 380 |
for (int k = 0; k < n; ++k) {
|
| 381 |
const auto & token2 = tokens[k];
|
| 382 |
|
|
|
|
| 400 |
txt_ul += "\\ ";
|
| 401 |
}
|
| 402 |
}
|
|
|
|
|
|
|
| 403 |
}
|
| 404 |
|
| 405 |
::replace_all(txt_bg, "'", "\u2019");
|
|
|
|
| 634 |
{
|
| 635 |
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
| 636 |
|
| 637 |
+
wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
|
| 638 |
bool is_aborted = *(bool*)user_data;
|
| 639 |
return !is_aborted;
|
| 640 |
};
|
examples/stream/stream.cpp
CHANGED
|
@@ -90,7 +90,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 90 |
return true;
|
| 91 |
}
|
| 92 |
|
| 93 |
-
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
| 94 |
fprintf(stderr, "\n");
|
| 95 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
| 96 |
fprintf(stderr, "\n");
|
|
@@ -391,7 +391,7 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
|
|
| 391 |
float energy_all = 0.0f;
|
| 392 |
float energy_last = 0.0f;
|
| 393 |
|
| 394 |
-
for (
|
| 395 |
energy_all += fabsf(pcmf32[i]);
|
| 396 |
|
| 397 |
if (i >= n_samples - n_samples_last) {
|
|
|
|
| 90 |
return true;
|
| 91 |
}
|
| 92 |
|
| 93 |
+
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
|
| 94 |
fprintf(stderr, "\n");
|
| 95 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
| 96 |
fprintf(stderr, "\n");
|
|
|
|
| 391 |
float energy_all = 0.0f;
|
| 392 |
float energy_last = 0.0f;
|
| 393 |
|
| 394 |
+
for (int i = 0; i < n_samples; i++) {
|
| 395 |
energy_all += fabsf(pcmf32[i]);
|
| 396 |
|
| 397 |
if (i >= n_samples - n_samples_last) {
|
examples/talk/gpt-2.cpp
CHANGED
|
@@ -78,7 +78,7 @@ gpt_vocab::id gpt_sample_top_k_top_p(
|
|
| 78 |
const float * logits,
|
| 79 |
int top_k,
|
| 80 |
double top_p,
|
| 81 |
-
double temp,
|
| 82 |
std::mt19937 & rng) {
|
| 83 |
int n_logits = vocab.id_to_token.size();
|
| 84 |
|
|
@@ -268,7 +268,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|
| 268 |
fin.read((char *) &len, sizeof(len));
|
| 269 |
|
| 270 |
word.resize(len);
|
| 271 |
-
fin.read((char *) word
|
| 272 |
|
| 273 |
vocab.token_to_id[word] = i;
|
| 274 |
vocab.id_to_token[i] = word;
|
|
@@ -884,7 +884,7 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
|
|
| 884 |
|
| 885 |
std::string result;
|
| 886 |
|
| 887 |
-
for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
|
| 888 |
// predict
|
| 889 |
if (embd.size() > 0) {
|
| 890 |
if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
|
|
|
|
| 78 |
const float * logits,
|
| 79 |
int top_k,
|
| 80 |
double top_p,
|
| 81 |
+
double /*temp*/,
|
| 82 |
std::mt19937 & rng) {
|
| 83 |
int n_logits = vocab.id_to_token.size();
|
| 84 |
|
|
|
|
| 268 |
fin.read((char *) &len, sizeof(len));
|
| 269 |
|
| 270 |
word.resize(len);
|
| 271 |
+
fin.read((char *) &word[0], len);
|
| 272 |
|
| 273 |
vocab.token_to_id[word] = i;
|
| 274 |
vocab.id_to_token[i] = word;
|
|
|
|
| 884 |
|
| 885 |
std::string result;
|
| 886 |
|
| 887 |
+
for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
|
| 888 |
// predict
|
| 889 |
if (embd.size() > 0) {
|
| 890 |
if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
|
examples/talk/talk.cpp
CHANGED
|
@@ -79,7 +79,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 79 |
return true;
|
| 80 |
}
|
| 81 |
|
| 82 |
-
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
| 83 |
fprintf(stderr, "\n");
|
| 84 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
| 85 |
fprintf(stderr, "\n");
|
|
@@ -397,7 +397,7 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
|
|
| 397 |
float energy_all = 0.0f;
|
| 398 |
float energy_last = 0.0f;
|
| 399 |
|
| 400 |
-
for (
|
| 401 |
energy_all += fabsf(pcmf32[i]);
|
| 402 |
|
| 403 |
if (i >= n_samples - n_samples_last) {
|
|
@@ -541,7 +541,6 @@ int main(int argc, char ** argv) {
|
|
| 541 |
bool force_speak = false;
|
| 542 |
|
| 543 |
float prob0 = 0.0f;
|
| 544 |
-
float prob = 0.0f;
|
| 545 |
|
| 546 |
std::vector<float> pcmf32_cur;
|
| 547 |
std::vector<float> pcmf32_prompt;
|
|
|
|
| 79 |
return true;
|
| 80 |
}
|
| 81 |
|
| 82 |
+
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
|
| 83 |
fprintf(stderr, "\n");
|
| 84 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
| 85 |
fprintf(stderr, "\n");
|
|
|
|
| 397 |
float energy_all = 0.0f;
|
| 398 |
float energy_last = 0.0f;
|
| 399 |
|
| 400 |
+
for (int i = 0; i < n_samples; i++) {
|
| 401 |
energy_all += fabsf(pcmf32[i]);
|
| 402 |
|
| 403 |
if (i >= n_samples - n_samples_last) {
|
|
|
|
| 541 |
bool force_speak = false;
|
| 542 |
|
| 543 |
float prob0 = 0.0f;
|
|
|
|
| 544 |
|
| 545 |
std::vector<float> pcmf32_cur;
|
| 546 |
std::vector<float> pcmf32_prompt;
|
whisper.cpp
CHANGED
|
@@ -621,7 +621,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
|
|
| 621 |
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
| 622 |
|
| 623 |
size_t ctx_size = 0;
|
| 624 |
-
size_t ctx_mem_size = 0;
|
| 625 |
|
| 626 |
{
|
| 627 |
const auto & hparams = model.hparams;
|
|
@@ -730,12 +729,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
|
|
| 730 |
ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b
|
| 731 |
}
|
| 732 |
|
| 733 |
-
ctx_mem_size += n_text_layer*n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_k
|
| 734 |
-
ctx_mem_size += n_text_layer*n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_v
|
| 735 |
-
|
| 736 |
-
ctx_mem_size += n_text_layer*n_audio_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_cross_k
|
| 737 |
-
ctx_mem_size += n_text_layer*n_audio_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_cross_v
|
| 738 |
-
|
| 739 |
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
|
| 740 |
|
| 741 |
fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
|
@@ -2043,7 +2036,7 @@ static void fft(const std::vector<float> & in, std::vector<float> & out) {
|
|
| 2043 |
static bool log_mel_spectrogram(
|
| 2044 |
const float * samples,
|
| 2045 |
const int n_samples,
|
| 2046 |
-
const int sample_rate,
|
| 2047 |
const int fft_size,
|
| 2048 |
const int fft_step,
|
| 2049 |
const int n_mel,
|
|
|
|
| 621 |
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
| 622 |
|
| 623 |
size_t ctx_size = 0;
|
|
|
|
| 624 |
|
| 625 |
{
|
| 626 |
const auto & hparams = model.hparams;
|
|
|
|
| 729 |
ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b
|
| 730 |
}
|
| 731 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 732 |
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
|
| 733 |
|
| 734 |
fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
|
|
|
| 2036 |
static bool log_mel_spectrogram(
|
| 2037 |
const float * samples,
|
| 2038 |
const int n_samples,
|
| 2039 |
+
const int /*sample_rate*/,
|
| 2040 |
const int fft_size,
|
| 2041 |
const int fft_step,
|
| 2042 |
const int n_mel,
|