rmmh committed on
Commit
d6e13b6
·
unverified ·
1 Parent(s): b4085c3

server : implement "verbose_json" format with token details (#1781)

Browse files

* examples/server: implement "verbose_json" format with token details.

This is intended to mirror the format of OpenAI's Python
whisper.transcribe() return values.

* server: don't write WAV to a temporary file if not converting

* server: use std::lock_guard instead of manual lock/unlock

examples/common.cpp CHANGED
@@ -639,6 +639,12 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector
639
 
640
  fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
641
  }
 
 
 
 
 
 
642
  else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
643
  fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
644
  return false;
 
639
 
640
  fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
641
  }
642
+ else if (fname.size() > 256 || fname.size() > 40 && fname.substr(0, 4) == "RIFF" && fname.substr(8, 4) == "WAVE") {
643
+ if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
644
+ fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
645
+ return false;
646
+ }
647
+ }
648
  else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
649
  fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
650
  return false;
examples/common.h CHANGED
@@ -136,6 +136,7 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
136
  //
137
 
138
  // Read WAV audio file and store the PCM data into pcmf32
 
139
  // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
140
  // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
141
  bool read_wav(
 
136
  //
137
 
138
  // Read WAV audio file and store the PCM data into pcmf32
139
+ // fname can be a buffer of WAV data instead of a filename
140
  // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
141
  // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
142
  bool read_wav(
examples/server/server.cpp CHANGED
@@ -18,7 +18,7 @@
18
  #endif
19
 
20
  using namespace httplib;
21
- using json = nlohmann::json;
22
 
23
  namespace {
24
 
@@ -556,7 +556,7 @@ int main(int argc, char ** argv) {
556
 
557
  svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
558
  // acquire whisper model mutex lock
559
- whisper_mutex.lock();
560
 
561
  // first check user requested fields of the request
562
  if (!req.has_file("file"))
@@ -564,7 +564,6 @@ int main(int argc, char ** argv) {
564
  fprintf(stderr, "error: no 'file' field in the request\n");
565
  const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
566
  res.set_content(error_resp, "application/json");
567
- whisper_mutex.unlock();
568
  return;
569
  }
570
  auto audio_file = req.get_file_value("file");
@@ -579,35 +578,42 @@ int main(int argc, char ** argv) {
579
  std::vector<float> pcmf32; // mono-channel F32 PCM
580
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
581
 
582
- // write to temporary file
583
- const std::string temp_filename = "whisper_server_temp_file.wav";
584
- std::ofstream temp_file{temp_filename, std::ios::binary};
585
- temp_file << audio_file.content;
586
- temp_file.close();
587
-
588
- // if file is not wav, convert to wav
589
-
590
  if (sparams.ffmpeg_converter) {
 
 
 
 
 
 
 
591
  std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
592
  const bool is_converted = convert_to_wav(temp_filename, error_resp);
593
  if (!is_converted) {
594
  res.set_content(error_resp, "application/json");
595
- whisper_mutex.unlock();
596
  return;
597
  }
598
- }
599
 
600
- // read wav content into pcmf32
601
- if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
602
- fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
603
- const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
604
- res.set_content(error_resp, "application/json");
 
 
 
 
 
605
  std::remove(temp_filename.c_str());
606
- whisper_mutex.unlock();
607
- return;
 
 
 
 
 
 
608
  }
609
- // remove temp file
610
- std::remove(temp_filename.c_str());
611
 
612
  printf("Successfully loaded %s\n", filename.c_str());
613
 
@@ -681,6 +687,7 @@ int main(int argc, char ** argv) {
681
  wparams.logprob_thold = params.logprob_thold;
682
 
683
  wparams.no_timestamps = params.no_timestamps;
 
684
 
685
  whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
686
 
@@ -724,7 +731,6 @@ int main(int argc, char ** argv) {
724
  fprintf(stderr, "%s: failed to process audio\n", argv[0]);
725
  const std::string error_resp = "{\"error\":\"failed to process audio\"}";
726
  res.set_content(error_resp, "application/json");
727
- whisper_mutex.unlock();
728
  return;
729
  }
730
  }
@@ -778,6 +784,43 @@ int main(int argc, char ** argv) {
778
  ss << speaker << text << "\n\n";
779
  }
780
  res.set_content(ss.str(), "text/vtt");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
781
  }
782
  // TODO add more output formats
783
  else
@@ -792,18 +835,14 @@ int main(int argc, char ** argv) {
792
 
793
  // reset params to their defaults
794
  params = default_params;
795
-
796
- // return whisper model mutex lock
797
- whisper_mutex.unlock();
798
  });
799
  svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
800
- whisper_mutex.lock();
801
  if (!req.has_file("model"))
802
  {
803
  fprintf(stderr, "error: no 'model' field in the request\n");
804
  const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
805
  res.set_content(error_resp, "application/json");
806
- whisper_mutex.unlock();
807
  return;
808
  }
809
  std::string model = req.get_file_value("model").content;
@@ -812,7 +851,6 @@ int main(int argc, char ** argv) {
812
  fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
813
  const std::string error_resp = "{\"error\":\"model not found!\"}";
814
  res.set_content(error_resp, "application/json");
815
- whisper_mutex.unlock();
816
  return;
817
  }
818
 
@@ -835,7 +873,6 @@ int main(int argc, char ** argv) {
835
  res.set_content(success, "application/text");
836
 
837
  // check if the model is in the file system
838
- whisper_mutex.unlock();
839
  });
840
 
841
  svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
 
18
  #endif
19
 
20
  using namespace httplib;
21
+ using json = nlohmann::ordered_json;
22
 
23
  namespace {
24
 
 
556
 
557
  svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
558
  // acquire whisper model mutex lock
559
+ std::lock_guard<std::mutex> lock(whisper_mutex);
560
 
561
  // first check user requested fields of the request
562
  if (!req.has_file("file"))
 
564
  fprintf(stderr, "error: no 'file' field in the request\n");
565
  const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
566
  res.set_content(error_resp, "application/json");
 
567
  return;
568
  }
569
  auto audio_file = req.get_file_value("file");
 
578
  std::vector<float> pcmf32; // mono-channel F32 PCM
579
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
580
 
 
 
 
 
 
 
 
 
581
  if (sparams.ffmpeg_converter) {
582
+ // if file is not wav, convert to wav
583
+ // write to temporary file
584
+ const std::string temp_filename = "whisper_server_temp_file.wav";
585
+ std::ofstream temp_file{temp_filename, std::ios::binary};
586
+ temp_file << audio_file.content;
587
+ temp_file.close();
588
+
589
  std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
590
  const bool is_converted = convert_to_wav(temp_filename, error_resp);
591
  if (!is_converted) {
592
  res.set_content(error_resp, "application/json");
 
593
  return;
594
  }
 
595
 
596
+ // read wav content into pcmf32
597
+ if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
598
+ {
599
+ fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
600
+ const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
601
+ res.set_content(error_resp, "application/json");
602
+ std::remove(temp_filename.c_str());
603
+ return;
604
+ }
605
+ // remove temp file
606
  std::remove(temp_filename.c_str());
607
+ } else {
608
+ if (!::read_wav(audio_file.content, pcmf32, pcmf32s, params.diarize))
609
+ {
610
+ fprintf(stderr, "error: failed to read WAV file\n");
611
+ const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
612
+ res.set_content(error_resp, "application/json");
613
+ return;
614
+ }
615
  }
616
+
 
617
 
618
  printf("Successfully loaded %s\n", filename.c_str());
619
 
 
687
  wparams.logprob_thold = params.logprob_thold;
688
 
689
  wparams.no_timestamps = params.no_timestamps;
690
+ wparams.token_timestamps = !params.no_timestamps && params.response_format == vjson_format;
691
 
692
  whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
693
 
 
731
  fprintf(stderr, "%s: failed to process audio\n", argv[0]);
732
  const std::string error_resp = "{\"error\":\"failed to process audio\"}";
733
  res.set_content(error_resp, "application/json");
 
734
  return;
735
  }
736
  }
 
784
  ss << speaker << text << "\n\n";
785
  }
786
  res.set_content(ss.str(), "text/vtt");
787
+ } else if (params.response_format == vjson_format) {
788
+ /* try to match openai/whisper's Python format */
789
+ std::string results = output_str(ctx, params, pcmf32s);
790
+ json jres = json{{"text", results}};
791
+ const int n_segments = whisper_full_n_segments(ctx);
792
+ for (int i = 0; i < n_segments; ++i)
793
+ {
794
+ json segment = json{
795
+ {"id", i},
796
+ {"text", whisper_full_get_segment_text(ctx, i)},
797
+ };
798
+
799
+ if (!params.no_timestamps) {
800
+ segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01;
801
+ segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
802
+ }
803
+
804
+ const int n_tokens = whisper_full_n_tokens(ctx, i);
805
+ for (int j = 0; j < n_tokens; ++j) {
806
+ whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
807
+ if (token.id >= whisper_token_eot(ctx)) {
808
+ continue;
809
+ }
810
+
811
+ segment["tokens"].push_back(token.id);
812
+ json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}};
813
+ if (!params.no_timestamps) {
814
+ word["start"] = token.t0 * 0.01;
815
+ word["end"] = token.t1 * 0.01;
816
+ }
817
+ word["probability"] = token.p;
818
+ segment["words"].push_back(word);
819
+ }
820
+ jres["segments"].push_back(segment);
821
+ }
822
+ res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
823
+ "application/json");
824
  }
825
  // TODO add more output formats
826
  else
 
835
 
836
  // reset params to their defaults
837
  params = default_params;
 
 
 
838
  });
839
  svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
840
+ std::lock_guard<std::mutex> lock(whisper_mutex);
841
  if (!req.has_file("model"))
842
  {
843
  fprintf(stderr, "error: no 'model' field in the request\n");
844
  const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
845
  res.set_content(error_resp, "application/json");
 
846
  return;
847
  }
848
  std::string model = req.get_file_value("model").content;
 
851
  fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
852
  const std::string error_resp = "{\"error\":\"model not found!\"}";
853
  res.set_content(error_resp, "application/json");
 
854
  return;
855
  }
856
 
 
873
  res.set_content(success, "application/text");
874
 
875
  // check if the model is in the file system
 
876
  });
877
 
878
  svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {