Spaces:
Sleeping
Sleeping
server : automatically convert audio on the server (#1539)
Browse files* server : automatically convert audio on the server
* server : remove redundant comments
* server : automatic conversion refactor
* server : update server readme
* server : remove unnecessary comments and tabs
* server : put back remove calling
* server : apply suggestions from code review
Co-authored-by: Georgi Gerganov <[email protected]>
* server : check ffmpeg before the server launch
* server : fix indentation
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <[email protected]>
* server : fix function typo calling
* server : fix function typo calling
* server : add warning in readme
---------
Co-authored-by: Georgi Gerganov <[email protected]>
- examples/server/README.md +4 -0
- examples/server/server.cpp +58 -1
examples/server/README.md
CHANGED
|
@@ -43,8 +43,12 @@ options:
|
|
| 43 |
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
|
| 44 |
--host HOST, [127.0.0.1] Hostname/ip-adress for the server
|
| 45 |
--port PORT, [8080 ] Port number for the server
|
|
|
|
| 46 |
```
|
| 47 |
|
|
|
|
|
|
|
|
|
|
| 48 |
## request examples
|
| 49 |
|
| 50 |
**/inference**
|
|
|
|
| 43 |
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
|
| 44 |
--host HOST, [127.0.0.1] Hostname/ip-adress for the server
|
| 45 |
--port PORT, [8080 ] Port number for the server
|
| 46 |
+
--convert, [false ] Convert audio to WAV, requires ffmpeg on the server
|
| 47 |
```
|
| 48 |
|
| 49 |
+
> [!WARNING]
|
| 50 |
+
> **Do not run the server example with administrative privileges and ensure it's operated in a sandbox environment, especially since it involves risky operations like accepting user file uploads and using ffmpeg for format conversions. Always validate and sanitize inputs to guard against potential security threats.**
|
| 51 |
+
|
| 52 |
## request examples
|
| 53 |
|
| 54 |
**/inference**
|
examples/server/server.cpp
CHANGED
|
@@ -43,6 +43,8 @@ struct server_params
|
|
| 43 |
int32_t port = 8080;
|
| 44 |
int32_t read_timeout = 600;
|
| 45 |
int32_t write_timeout = 600;
|
|
|
|
|
|
|
| 46 |
};
|
| 47 |
|
| 48 |
struct whisper_params {
|
|
@@ -157,6 +159,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 157 |
fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
|
| 158 |
fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
|
| 159 |
fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
|
|
|
|
| 160 |
fprintf(stderr, "\n");
|
| 161 |
}
|
| 162 |
|
|
@@ -203,6 +206,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
|
|
| 203 |
else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
|
| 204 |
else if ( arg == "--host") { sparams.hostname = argv[++i]; }
|
| 205 |
else if ( arg == "--public") { sparams.public_path = argv[++i]; }
|
|
|
|
| 206 |
else {
|
| 207 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 208 |
whisper_print_usage(argc, argv, params, sparams);
|
|
@@ -220,6 +224,45 @@ struct whisper_print_user_data {
|
|
| 220 |
int progress_prev;
|
| 221 |
};
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
|
| 224 |
std::string speaker = "";
|
| 225 |
const int64_t n_samples = pcmf32s[0].size();
|
|
@@ -407,6 +450,9 @@ int main(int argc, char ** argv) {
|
|
| 407 |
exit(0);
|
| 408 |
}
|
| 409 |
|
|
|
|
|
|
|
|
|
|
| 410 |
// whisper init
|
| 411 |
struct whisper_context_params cparams;
|
| 412 |
cparams.use_gpu = params.use_gpu;
|
|
@@ -462,6 +508,18 @@ int main(int argc, char ** argv) {
|
|
| 462 |
temp_file << audio_file.content;
|
| 463 |
temp_file.close();
|
| 464 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
// read wav content into pcmf32
|
| 466 |
if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
|
| 467 |
fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
|
|
@@ -509,7 +567,6 @@ int main(int argc, char ** argv) {
|
|
| 509 |
|
| 510 |
// run the inference
|
| 511 |
{
|
| 512 |
-
|
| 513 |
printf("Running whisper.cpp inference on %s\n", filename.c_str());
|
| 514 |
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
| 515 |
|
|
|
|
| 43 |
int32_t port = 8080;
|
| 44 |
int32_t read_timeout = 600;
|
| 45 |
int32_t write_timeout = 600;
|
| 46 |
+
|
| 47 |
+
bool ffmpeg_converter = false;
|
| 48 |
};
|
| 49 |
|
| 50 |
struct whisper_params {
|
|
|
|
| 159 |
fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
|
| 160 |
fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
|
| 161 |
fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
|
| 162 |
+
fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
|
| 163 |
fprintf(stderr, "\n");
|
| 164 |
}
|
| 165 |
|
|
|
|
| 206 |
else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
|
| 207 |
else if ( arg == "--host") { sparams.hostname = argv[++i]; }
|
| 208 |
else if ( arg == "--public") { sparams.public_path = argv[++i]; }
|
| 209 |
+
else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
|
| 210 |
else {
|
| 211 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 212 |
whisper_print_usage(argc, argv, params, sparams);
|
|
|
|
| 224 |
int progress_prev;
|
| 225 |
};
|
| 226 |
|
| 227 |
+
// Verifies that the `ffmpeg` executable can be launched by running
// `ffmpeg -version` through system().
//
// On success a confirmation line is printed to stdout and the function
// returns normally. On failure an explanatory message is written to stderr
// and the process terminates with a non-zero exit status, so that shells,
// scripts and service managers can tell that start-up did not succeed.
void check_ffmpeg_availibility() {
    const int result = system("ffmpeg -version");

    if (result == 0) {
        std::cout << "ffmpeg is available." << std::endl;
        return;
    }

    // ffmpeg is not available: report on stderr (newline-terminated) and
    // signal the failure through the exit status
    fprintf(stderr, "ffmpeg is not found. Please ensure that ffmpeg is installed "
                    "and that its executable is included in your system's PATH.\n");
    exit(1);
}
|
| 239 |
+
|
| 240 |
+
// Converts the audio file at `temp_filename` in place to 16 kHz, mono,
// signed 16-bit PCM WAV by invoking the `ffmpeg` command-line tool.
//
// ffmpeg writes its output to a sibling file ("<temp_filename>_temp.wav").
// When the conversion succeeds, the original file is removed and the
// converted file is renamed to `temp_filename`, so the caller can keep using
// the same path.
//
// Parameters:
//   temp_filename - path of the file to convert. It is placed inside double
//                   quotes in a shell command without any escaping, so it
//                   must not be built from untrusted input.
//   error_resp    - receives a JSON error body when the function returns
//                   false; it is not modified on success.
//
// Returns true when the converted file has replaced the original, false if
// ffmpeg fails or the original cannot be removed / the output renamed.
bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
    const std::string converted_filename_temp = temp_filename + "_temp.wav";

    // -y: overwrite an existing output file. Without it, a stale
    // "<name>_temp.wav" left behind by an earlier failed request would make
    // ffmpeg refuse to run and break every subsequent conversion.
    std::ostringstream cmd_stream;
    cmd_stream << "ffmpeg -y -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
    const std::string cmd = cmd_stream.str();

    const int status = std::system(cmd.c_str());
    if (status != 0) {
        // best-effort cleanup of a partially written output file
        remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"FFmpeg conversion failed.\"}";
        return false;
    }

    // Remove the original file first: rename() onto an existing file is not
    // portable across platforms
    if (remove(temp_filename.c_str()) != 0) {
        remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"Failed to remove the original file.\"}";
        return false;
    }

    // Rename the temporary file to match the original filename
    if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) {
        remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"Failed to rename the temporary file.\"}";
        return false;
    }

    return true;
}
|
| 265 |
+
|
| 266 |
std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
|
| 267 |
std::string speaker = "";
|
| 268 |
const int64_t n_samples = pcmf32s[0].size();
|
|
|
|
| 450 |
exit(0);
|
| 451 |
}
|
| 452 |
|
| 453 |
+
if (sparams.ffmpeg_converter) {
|
| 454 |
+
check_ffmpeg_availibility();
|
| 455 |
+
}
|
| 456 |
// whisper init
|
| 457 |
struct whisper_context_params cparams;
|
| 458 |
cparams.use_gpu = params.use_gpu;
|
|
|
|
| 508 |
temp_file << audio_file.content;
|
| 509 |
temp_file.close();
|
| 510 |
|
| 511 |
+
// if file is not wav, convert to wav
|
| 512 |
+
|
| 513 |
+
if (sparams.ffmpeg_converter) {
|
| 514 |
+
std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
|
| 515 |
+
const bool is_converted = convert_to_wav(temp_filename, error_resp);
|
| 516 |
+
if (!is_converted) {
|
| 517 |
+
res.set_content(error_resp, "application/json");
|
| 518 |
+
whisper_mutex.unlock();
|
| 519 |
+
return;
|
| 520 |
+
}
|
| 521 |
+
}
|
| 522 |
+
|
| 523 |
// read wav content into pcmf32
|
| 524 |
if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
|
| 525 |
fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
|
|
|
|
| 567 |
|
| 568 |
// run the inference
|
| 569 |
{
|
|
|
|
| 570 |
printf("Running whisper.cpp inference on %s\n", filename.c_str());
|
| 571 |
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
| 572 |
|