Spaces:
Sleeping
Sleeping
server : hide language probabilities option behind flag (#3328)
Browse files
* examples/server: hide language probabilities option behind flag
* code review
* fix
- examples/server/server.cpp +20 -11
examples/server/server.cpp
CHANGED
|
@@ -104,6 +104,7 @@ struct whisper_params {
|
|
| 104 |
bool flash_attn = false;
|
| 105 |
bool suppress_nst = false;
|
| 106 |
bool no_context = false;
|
|
|
|
| 107 |
|
| 108 |
std::string language = "en";
|
| 109 |
std::string prompt = "";
|
|
@@ -178,6 +179,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 178 |
fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
|
| 179 |
fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
|
| 180 |
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
|
|
|
|
| 181 |
// Voice Activity Detection (VAD) parameters
|
| 182 |
fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
|
| 183 |
fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
|
|
@@ -237,6 +239,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
|
|
| 237 |
else if (arg == "-sns" || arg == "--suppress-nst") { params.suppress_nst = true; }
|
| 238 |
else if (arg == "-nth" || arg == "--no-speech-thold") { params.no_speech_thold = std::stof(argv[++i]); }
|
| 239 |
else if (arg == "-nc" || arg == "--no-context") { params.no_context = true; }
|
|
|
|
| 240 |
|
| 241 |
// server params
|
| 242 |
else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
|
|
@@ -599,6 +602,10 @@ void get_req_parameters(const Request & req, whisper_params & params)
|
|
| 599 |
{
|
| 600 |
params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
|
| 601 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 602 |
}
|
| 603 |
|
| 604 |
} // namespace
|
|
@@ -1024,23 +1031,25 @@ int main(int argc, char ** argv) {
|
|
| 1024 |
} else if (params.response_format == vjson_format) {
|
| 1025 |
/* try to match openai/whisper's Python format */
|
| 1026 |
std::string results = output_str(ctx, params, pcmf32s);
|
| 1027 |
-
// Get language probabilities
|
| 1028 |
-
std::vector<float> lang_probs(whisper_lang_max_id() + 1, 0.0f);
|
| 1029 |
-
const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, lang_probs.data());
|
| 1030 |
json jres = json{
|
| 1031 |
{"task", params.translate ? "translate" : "transcribe"},
|
| 1032 |
{"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
|
| 1033 |
{"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
|
| 1034 |
{"text", results},
|
| 1035 |
-
{"segments", json::array()}
|
| 1036 |
-
{"detected_language", whisper_lang_str_full(detected_lang_id)},
|
| 1037 |
-
{"detected_language_probability", lang_probs[detected_lang_id]},
|
| 1038 |
-
{"language_probabilities", json::object()}
|
| 1039 |
};
|
| 1040 |
-
//
|
| 1041 |
-
|
| 1042 |
-
|
| 1043 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1044 |
}
|
| 1045 |
}
|
| 1046 |
const int n_segments = whisper_full_n_segments(ctx);
|
|
|
|
| 104 |
bool flash_attn = false;
|
| 105 |
bool suppress_nst = false;
|
| 106 |
bool no_context = false;
|
| 107 |
+
bool no_language_probabilities = false;
|
| 108 |
|
| 109 |
std::string language = "en";
|
| 110 |
std::string prompt = "";
|
|
|
|
| 179 |
fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
|
| 180 |
fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
|
| 181 |
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
|
| 182 |
+
fprintf(stderr, " -nlp, --no-language-probabilities [%-7s] exclude language probabilities from verbose_json output\n", params.no_language_probabilities ? "true" : "false");
|
| 183 |
// Voice Activity Detection (VAD) parameters
|
| 184 |
fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
|
| 185 |
fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
|
|
|
|
| 239 |
else if (arg == "-sns" || arg == "--suppress-nst") { params.suppress_nst = true; }
|
| 240 |
else if (arg == "-nth" || arg == "--no-speech-thold") { params.no_speech_thold = std::stof(argv[++i]); }
|
| 241 |
else if (arg == "-nc" || arg == "--no-context") { params.no_context = true; }
|
| 242 |
+
else if (arg == "-nlp" || arg == "--no-language-probabilities") { params.no_language_probabilities = true; }
|
| 243 |
|
| 244 |
// server params
|
| 245 |
else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
|
|
|
|
| 602 |
{
|
| 603 |
params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
|
| 604 |
}
|
| 605 |
+
if (req.has_file("no_language_probabilities"))
|
| 606 |
+
{
|
| 607 |
+
params.no_language_probabilities = parse_str_to_bool(req.get_file_value("no_language_probabilities").content);
|
| 608 |
+
}
|
| 609 |
}
|
| 610 |
|
| 611 |
} // namespace
|
|
|
|
| 1031 |
} else if (params.response_format == vjson_format) {
|
| 1032 |
/* try to match openai/whisper's Python format */
|
| 1033 |
std::string results = output_str(ctx, params, pcmf32s);
|
|
|
|
|
|
|
|
|
|
| 1034 |
json jres = json{
|
| 1035 |
{"task", params.translate ? "translate" : "transcribe"},
|
| 1036 |
{"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
|
| 1037 |
{"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
|
| 1038 |
{"text", results},
|
| 1039 |
+
{"segments", json::array()}
|
|
|
|
|
|
|
|
|
|
| 1040 |
};
|
| 1041 |
+
// Only compute language probabilities if requested (expensive operation)
|
| 1042 |
+
if (!params.no_language_probabilities) {
|
| 1043 |
+
std::vector<float> lang_probs(whisper_lang_max_id() + 1, 0.0f);
|
| 1044 |
+
const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, lang_probs.data());
|
| 1045 |
+
jres["detected_language"] = whisper_lang_str_full(detected_lang_id);
|
| 1046 |
+
jres["detected_language_probability"] = lang_probs[detected_lang_id];
|
| 1047 |
+
jres["language_probabilities"] = json::object();
|
| 1048 |
+
// Add all language probabilities
|
| 1049 |
+
for (int i = 0; i <= whisper_lang_max_id(); ++i) {
|
| 1050 |
+
if (lang_probs[i] > 0.001f) { // Only include non-negligible probabilities
|
| 1051 |
+
jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i];
|
| 1052 |
+
}
|
| 1053 |
}
|
| 1054 |
}
|
| 1055 |
const int n_segments = whisper_full_n_segments(ctx);
|