sachaarbonel committed on
Commit
606bf70
·
unverified ·
1 Parent(s): 2fd8067

server : hide language probabilities option behind flag (#3328)

Browse files

* examples/server: hide language probabilities option behind flag

* code review

* fix

Files changed (1) hide show
  1. examples/server/server.cpp +20 -11
examples/server/server.cpp CHANGED
@@ -104,6 +104,7 @@ struct whisper_params {
104
  bool flash_attn = false;
105
  bool suppress_nst = false;
106
  bool no_context = false;
 
107
 
108
  std::string language = "en";
109
  std::string prompt = "";
@@ -178,6 +179,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
178
  fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
179
  fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
180
  fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
 
181
  // Voice Activity Detection (VAD) parameters
182
  fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
183
  fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
@@ -237,6 +239,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
237
  else if (arg == "-sns" || arg == "--suppress-nst") { params.suppress_nst = true; }
238
  else if (arg == "-nth" || arg == "--no-speech-thold") { params.no_speech_thold = std::stof(argv[++i]); }
239
  else if (arg == "-nc" || arg == "--no-context") { params.no_context = true; }
 
240
 
241
  // server params
242
  else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
@@ -599,6 +602,10 @@ void get_req_parameters(const Request & req, whisper_params & params)
599
  {
600
  params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
601
  }
 
 
 
 
602
  }
603
 
604
  } // namespace
@@ -1024,23 +1031,25 @@ int main(int argc, char ** argv) {
1024
  } else if (params.response_format == vjson_format) {
1025
  /* try to match openai/whisper's Python format */
1026
  std::string results = output_str(ctx, params, pcmf32s);
1027
- // Get language probabilities
1028
- std::vector<float> lang_probs(whisper_lang_max_id() + 1, 0.0f);
1029
- const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, lang_probs.data());
1030
  json jres = json{
1031
  {"task", params.translate ? "translate" : "transcribe"},
1032
  {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
1033
  {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
1034
  {"text", results},
1035
- {"segments", json::array()},
1036
- {"detected_language", whisper_lang_str_full(detected_lang_id)},
1037
- {"detected_language_probability", lang_probs[detected_lang_id]},
1038
- {"language_probabilities", json::object()}
1039
  };
1040
- // Add all language probabilities
1041
- for (int i = 0; i <= whisper_lang_max_id(); ++i) {
1042
- if (lang_probs[i] > 0.001f) { // Only include non-negligible probabilities
1043
- jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i];
 
 
 
 
 
 
 
 
1044
  }
1045
  }
1046
  const int n_segments = whisper_full_n_segments(ctx);
 
104
  bool flash_attn = false;
105
  bool suppress_nst = false;
106
  bool no_context = false;
107
+ bool no_language_probabilities = false;
108
 
109
  std::string language = "en";
110
  std::string prompt = "";
 
179
  fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
180
  fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
181
  fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
182
+ fprintf(stderr, " -nlp, --no-language-probabilities [%-7s] exclude language probabilities from verbose_json output\n", params.no_language_probabilities ? "true" : "false");
183
  // Voice Activity Detection (VAD) parameters
184
  fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
185
  fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
 
239
  else if (arg == "-sns" || arg == "--suppress-nst") { params.suppress_nst = true; }
240
  else if (arg == "-nth" || arg == "--no-speech-thold") { params.no_speech_thold = std::stof(argv[++i]); }
241
  else if (arg == "-nc" || arg == "--no-context") { params.no_context = true; }
242
+ else if (arg == "-nlp" || arg == "--no-language-probabilities") { params.no_language_probabilities = true; }
243
 
244
  // server params
245
  else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
 
602
  {
603
  params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
604
  }
605
+ if (req.has_file("no_language_probabilities"))
606
+ {
607
+ params.no_language_probabilities = parse_str_to_bool(req.get_file_value("no_language_probabilities").content);
608
+ }
609
  }
610
 
611
  } // namespace
 
1031
  } else if (params.response_format == vjson_format) {
1032
  /* try to match openai/whisper's Python format */
1033
  std::string results = output_str(ctx, params, pcmf32s);
 
 
 
1034
  json jres = json{
1035
  {"task", params.translate ? "translate" : "transcribe"},
1036
  {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
1037
  {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
1038
  {"text", results},
1039
+ {"segments", json::array()}
 
 
 
1040
  };
1041
+ // Compute language probabilities unless disabled via no_language_probabilities (expensive operation)
1042
+ if (!params.no_language_probabilities) {
1043
+ std::vector<float> lang_probs(whisper_lang_max_id() + 1, 0.0f);
1044
+ const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, lang_probs.data());
1045
+ jres["detected_language"] = whisper_lang_str_full(detected_lang_id);
1046
+ jres["detected_language_probability"] = lang_probs[detected_lang_id];
1047
+ jres["language_probabilities"] = json::object();
1048
+ // Add all language probabilities
1049
+ for (int i = 0; i <= whisper_lang_max_id(); ++i) {
1050
+ if (lang_probs[i] > 0.001f) { // Only include non-negligible probabilities
1051
+ jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i];
1052
+ }
1053
  }
1054
  }
1055
  const int n_segments = whisper_full_n_segments(ctx);