Spaces:
Running
Running
server : implement "verbose_json" format with token details (#1781)
Browse files
* examples/server: implement "verbose_json" format with token details.
This is intended to mirror the format of openai's Python
whisper.transcribe() return values.
* server: don't write WAV to a temporary file if not converting
* server: use std::lock_guard instead of manual lock/unlock
- examples/common.cpp +6 -0
- examples/common.h +1 -0
- examples/server/server.cpp +67 -30
examples/common.cpp
CHANGED
|
@@ -639,6 +639,12 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector
|
|
| 639 |
|
| 640 |
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
|
| 641 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
|
| 643 |
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
|
| 644 |
return false;
|
|
|
|
| 639 |
|
| 640 |
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
|
| 641 |
}
|
| 642 |
+
else if (fname.size() > 256 || fname.size() > 40 && fname.substr(0, 4) == "RIFF" && fname.substr(8, 4) == "WAVE") {
|
| 643 |
+
if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
|
| 644 |
+
fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
|
| 645 |
+
return false;
|
| 646 |
+
}
|
| 647 |
+
}
|
| 648 |
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
|
| 649 |
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
|
| 650 |
return false;
|
examples/common.h
CHANGED
|
@@ -136,6 +136,7 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
|
|
| 136 |
//
|
| 137 |
|
| 138 |
// Read WAV audio file and store the PCM data into pcmf32
|
|
|
|
| 139 |
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
|
| 140 |
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
|
| 141 |
bool read_wav(
|
|
|
|
| 136 |
//
|
| 137 |
|
| 138 |
// Read WAV audio file and store the PCM data into pcmf32
|
| 139 |
+
// fname can be a buffer of WAV data instead of a filename
|
| 140 |
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
|
| 141 |
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
|
| 142 |
bool read_wav(
|
examples/server/server.cpp
CHANGED
|
@@ -18,7 +18,7 @@
|
|
| 18 |
#endif
|
| 19 |
|
| 20 |
using namespace httplib;
|
| 21 |
-
using json = nlohmann::json;
|
| 22 |
|
| 23 |
namespace {
|
| 24 |
|
|
@@ -556,7 +556,7 @@ int main(int argc, char ** argv) {
|
|
| 556 |
|
| 557 |
svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
|
| 558 |
// acquire whisper model mutex lock
|
| 559 |
-
|
| 560 |
|
| 561 |
// first check user requested fields of the request
|
| 562 |
if (!req.has_file("file"))
|
|
@@ -564,7 +564,6 @@ int main(int argc, char ** argv) {
|
|
| 564 |
fprintf(stderr, "error: no 'file' field in the request\n");
|
| 565 |
const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
|
| 566 |
res.set_content(error_resp, "application/json");
|
| 567 |
-
whisper_mutex.unlock();
|
| 568 |
return;
|
| 569 |
}
|
| 570 |
auto audio_file = req.get_file_value("file");
|
|
@@ -579,35 +578,42 @@ int main(int argc, char ** argv) {
|
|
| 579 |
std::vector<float> pcmf32; // mono-channel F32 PCM
|
| 580 |
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
| 581 |
|
| 582 |
-
// write to temporary file
|
| 583 |
-
const std::string temp_filename = "whisper_server_temp_file.wav";
|
| 584 |
-
std::ofstream temp_file{temp_filename, std::ios::binary};
|
| 585 |
-
temp_file << audio_file.content;
|
| 586 |
-
temp_file.close();
|
| 587 |
-
|
| 588 |
-
// if file is not wav, convert to wav
|
| 589 |
-
|
| 590 |
if (sparams.ffmpeg_converter) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 591 |
std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
|
| 592 |
const bool is_converted = convert_to_wav(temp_filename, error_resp);
|
| 593 |
if (!is_converted) {
|
| 594 |
res.set_content(error_resp, "application/json");
|
| 595 |
-
whisper_mutex.unlock();
|
| 596 |
return;
|
| 597 |
}
|
| 598 |
-
}
|
| 599 |
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 605 |
std::remove(temp_filename.c_str());
|
| 606 |
-
|
| 607 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
}
|
| 609 |
-
|
| 610 |
-
std::remove(temp_filename.c_str());
|
| 611 |
|
| 612 |
printf("Successfully loaded %s\n", filename.c_str());
|
| 613 |
|
|
@@ -681,6 +687,7 @@ int main(int argc, char ** argv) {
|
|
| 681 |
wparams.logprob_thold = params.logprob_thold;
|
| 682 |
|
| 683 |
wparams.no_timestamps = params.no_timestamps;
|
|
|
|
| 684 |
|
| 685 |
whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
|
| 686 |
|
|
@@ -724,7 +731,6 @@ int main(int argc, char ** argv) {
|
|
| 724 |
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
| 725 |
const std::string error_resp = "{\"error\":\"failed to process audio\"}";
|
| 726 |
res.set_content(error_resp, "application/json");
|
| 727 |
-
whisper_mutex.unlock();
|
| 728 |
return;
|
| 729 |
}
|
| 730 |
}
|
|
@@ -778,6 +784,43 @@ int main(int argc, char ** argv) {
|
|
| 778 |
ss << speaker << text << "\n\n";
|
| 779 |
}
|
| 780 |
res.set_content(ss.str(), "text/vtt");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
}
|
| 782 |
// TODO add more output formats
|
| 783 |
else
|
|
@@ -792,18 +835,14 @@ int main(int argc, char ** argv) {
|
|
| 792 |
|
| 793 |
// reset params to thier defaults
|
| 794 |
params = default_params;
|
| 795 |
-
|
| 796 |
-
// return whisper model mutex lock
|
| 797 |
-
whisper_mutex.unlock();
|
| 798 |
});
|
| 799 |
svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
|
| 800 |
-
|
| 801 |
if (!req.has_file("model"))
|
| 802 |
{
|
| 803 |
fprintf(stderr, "error: no 'model' field in the request\n");
|
| 804 |
const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
|
| 805 |
res.set_content(error_resp, "application/json");
|
| 806 |
-
whisper_mutex.unlock();
|
| 807 |
return;
|
| 808 |
}
|
| 809 |
std::string model = req.get_file_value("model").content;
|
|
@@ -812,7 +851,6 @@ int main(int argc, char ** argv) {
|
|
| 812 |
fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
|
| 813 |
const std::string error_resp = "{\"error\":\"model not found!\"}";
|
| 814 |
res.set_content(error_resp, "application/json");
|
| 815 |
-
whisper_mutex.unlock();
|
| 816 |
return;
|
| 817 |
}
|
| 818 |
|
|
@@ -835,7 +873,6 @@ int main(int argc, char ** argv) {
|
|
| 835 |
res.set_content(success, "application/text");
|
| 836 |
|
| 837 |
// check if the model is in the file system
|
| 838 |
-
whisper_mutex.unlock();
|
| 839 |
});
|
| 840 |
|
| 841 |
svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
|
|
|
|
| 18 |
#endif
|
| 19 |
|
| 20 |
using namespace httplib;
|
| 21 |
+
using json = nlohmann::ordered_json;
|
| 22 |
|
| 23 |
namespace {
|
| 24 |
|
|
|
|
| 556 |
|
| 557 |
svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
|
| 558 |
// acquire whisper model mutex lock
|
| 559 |
+
std::lock_guard<std::mutex> lock(whisper_mutex);
|
| 560 |
|
| 561 |
// first check user requested fields of the request
|
| 562 |
if (!req.has_file("file"))
|
|
|
|
| 564 |
fprintf(stderr, "error: no 'file' field in the request\n");
|
| 565 |
const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
|
| 566 |
res.set_content(error_resp, "application/json");
|
|
|
|
| 567 |
return;
|
| 568 |
}
|
| 569 |
auto audio_file = req.get_file_value("file");
|
|
|
|
| 578 |
std::vector<float> pcmf32; // mono-channel F32 PCM
|
| 579 |
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
| 580 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
if (sparams.ffmpeg_converter) {
|
| 582 |
+
// if file is not wav, convert to wav
|
| 583 |
+
// write to temporary file
|
| 584 |
+
const std::string temp_filename = "whisper_server_temp_file.wav";
|
| 585 |
+
std::ofstream temp_file{temp_filename, std::ios::binary};
|
| 586 |
+
temp_file << audio_file.content;
|
| 587 |
+
temp_file.close();
|
| 588 |
+
|
| 589 |
std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
|
| 590 |
const bool is_converted = convert_to_wav(temp_filename, error_resp);
|
| 591 |
if (!is_converted) {
|
| 592 |
res.set_content(error_resp, "application/json");
|
|
|
|
| 593 |
return;
|
| 594 |
}
|
|
|
|
| 595 |
|
| 596 |
+
// read wav content into pcmf32
|
| 597 |
+
if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
|
| 598 |
+
{
|
| 599 |
+
fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
|
| 600 |
+
const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
|
| 601 |
+
res.set_content(error_resp, "application/json");
|
| 602 |
+
std::remove(temp_filename.c_str());
|
| 603 |
+
return;
|
| 604 |
+
}
|
| 605 |
+
// remove temp file
|
| 606 |
std::remove(temp_filename.c_str());
|
| 607 |
+
} else {
|
| 608 |
+
if (!::read_wav(audio_file.content, pcmf32, pcmf32s, params.diarize))
|
| 609 |
+
{
|
| 610 |
+
fprintf(stderr, "error: failed to read WAV file\n");
|
| 611 |
+
const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
|
| 612 |
+
res.set_content(error_resp, "application/json");
|
| 613 |
+
return;
|
| 614 |
+
}
|
| 615 |
}
|
| 616 |
+
|
|
|
|
| 617 |
|
| 618 |
printf("Successfully loaded %s\n", filename.c_str());
|
| 619 |
|
|
|
|
| 687 |
wparams.logprob_thold = params.logprob_thold;
|
| 688 |
|
| 689 |
wparams.no_timestamps = params.no_timestamps;
|
| 690 |
+
wparams.token_timestamps = !params.no_timestamps && params.response_format == vjson_format;
|
| 691 |
|
| 692 |
whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
|
| 693 |
|
|
|
|
| 731 |
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
| 732 |
const std::string error_resp = "{\"error\":\"failed to process audio\"}";
|
| 733 |
res.set_content(error_resp, "application/json");
|
|
|
|
| 734 |
return;
|
| 735 |
}
|
| 736 |
}
|
|
|
|
| 784 |
ss << speaker << text << "\n\n";
|
| 785 |
}
|
| 786 |
res.set_content(ss.str(), "text/vtt");
|
| 787 |
+
} else if (params.response_format == vjson_format) {
|
| 788 |
+
/* try to match openai/whisper's Python format */
|
| 789 |
+
std::string results = output_str(ctx, params, pcmf32s);
|
| 790 |
+
json jres = json{{"text", results}};
|
| 791 |
+
const int n_segments = whisper_full_n_segments(ctx);
|
| 792 |
+
for (int i = 0; i < n_segments; ++i)
|
| 793 |
+
{
|
| 794 |
+
json segment = json{
|
| 795 |
+
{"id", i},
|
| 796 |
+
{"text", whisper_full_get_segment_text(ctx, i)},
|
| 797 |
+
};
|
| 798 |
+
|
| 799 |
+
if (!params.no_timestamps) {
|
| 800 |
+
segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01;
|
| 801 |
+
segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
|
| 802 |
+
}
|
| 803 |
+
|
| 804 |
+
const int n_tokens = whisper_full_n_tokens(ctx, i);
|
| 805 |
+
for (int j = 0; j < n_tokens; ++j) {
|
| 806 |
+
whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
|
| 807 |
+
if (token.id >= whisper_token_eot(ctx)) {
|
| 808 |
+
continue;
|
| 809 |
+
}
|
| 810 |
+
|
| 811 |
+
segment["tokens"].push_back(token.id);
|
| 812 |
+
json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}};
|
| 813 |
+
if (!params.no_timestamps) {
|
| 814 |
+
word["start"] = token.t0 * 0.01;
|
| 815 |
+
word["end"] = token.t1 * 0.01;
|
| 816 |
+
}
|
| 817 |
+
word["probability"] = token.p;
|
| 818 |
+
segment["words"].push_back(word);
|
| 819 |
+
}
|
| 820 |
+
jres["segments"].push_back(segment);
|
| 821 |
+
}
|
| 822 |
+
res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
|
| 823 |
+
"application/json");
|
| 824 |
}
|
| 825 |
// TODO add more output formats
|
| 826 |
else
|
|
|
|
| 835 |
|
| 836 |
// reset params to thier defaults
|
| 837 |
params = default_params;
|
|
|
|
|
|
|
|
|
|
| 838 |
});
|
| 839 |
svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
|
| 840 |
+
std::lock_guard<std::mutex> lock(whisper_mutex);
|
| 841 |
if (!req.has_file("model"))
|
| 842 |
{
|
| 843 |
fprintf(stderr, "error: no 'model' field in the request\n");
|
| 844 |
const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
|
| 845 |
res.set_content(error_resp, "application/json");
|
|
|
|
| 846 |
return;
|
| 847 |
}
|
| 848 |
std::string model = req.get_file_value("model").content;
|
|
|
|
| 851 |
fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
|
| 852 |
const std::string error_resp = "{\"error\":\"model not found!\"}";
|
| 853 |
res.set_content(error_resp, "application/json");
|
|
|
|
| 854 |
return;
|
| 855 |
}
|
| 856 |
|
|
|
|
| 873 |
res.set_content(success, "application/text");
|
| 874 |
|
| 875 |
// check if the model is in the file system
|
|
|
|
| 876 |
});
|
| 877 |
|
| 878 |
svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
|