Ismatulla Mansurov ggerganov committed on
Commit
99eeacd
·
unverified ·
1 Parent(s): ee64ad8

server : automatically convert audio on the server (#1539)

Browse files

* server : automatically convert audio on the server

* server : remove redundant comments

* server : automatic conversion refactor

* server : update server readme

* server : remove unnecessary comments and tabs

* server : put back remove calling

* server : apply suggestions from code review

Co-authored-by: Georgi Gerganov <[email protected]>

* server : check ffmpeg before the server launch

* server : fix indentation

* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <[email protected]>

* server : fix function typo calling

* server : fix function typo calling

* server : add warning in readme

---------

Co-authored-by: Georgi Gerganov <[email protected]>

examples/server/README.md CHANGED
@@ -43,8 +43,12 @@ options:
43
  -oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
44
  --host HOST, [127.0.0.1] Hostname/ip-adress for the server
45
  --port PORT, [8080 ] Port number for the server
 
46
  ```
47
 
 
 
 
48
  ## request examples
49
 
50
  **/inference**
 
43
  -oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
44
  --host HOST, [127.0.0.1] Hostname/ip-adress for the server
45
  --port PORT, [8080 ] Port number for the server
46
+ --convert, [false ] Convert audio to WAV, requires ffmpeg on the server
47
  ```
48
 
49
+ > [!WARNING]
50
+ > **Do not run the server example with administrative privileges and ensure it's operated in a sandbox environment, especially since it involves risky operations like accepting user file uploads and using ffmpeg for format conversions. Always validate and sanitize inputs to guard against potential security threats.**
51
+
52
  ## request examples
53
 
54
  **/inference**
examples/server/server.cpp CHANGED
@@ -43,6 +43,8 @@ struct server_params
43
  int32_t port = 8080;
44
  int32_t read_timeout = 600;
45
  int32_t write_timeout = 600;
 
 
46
  };
47
 
48
  struct whisper_params {
@@ -157,6 +159,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
157
  fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
158
  fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
159
  fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
 
160
  fprintf(stderr, "\n");
161
  }
162
 
@@ -203,6 +206,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
203
  else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
204
  else if ( arg == "--host") { sparams.hostname = argv[++i]; }
205
  else if ( arg == "--public") { sparams.public_path = argv[++i]; }
 
206
  else {
207
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
208
  whisper_print_usage(argc, argv, params, sparams);
@@ -220,6 +224,45 @@ struct whisper_print_user_data {
220
  int progress_prev;
221
  };
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
224
  std::string speaker = "";
225
  const int64_t n_samples = pcmf32s[0].size();
@@ -407,6 +450,9 @@ int main(int argc, char ** argv) {
407
  exit(0);
408
  }
409
 
 
 
 
410
  // whisper init
411
  struct whisper_context_params cparams;
412
  cparams.use_gpu = params.use_gpu;
@@ -462,6 +508,18 @@ int main(int argc, char ** argv) {
462
  temp_file << audio_file.content;
463
  temp_file.close();
464
 
 
 
 
 
 
 
 
 
 
 
 
 
465
  // read wav content into pcmf32
466
  if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
467
  fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
@@ -509,7 +567,6 @@ int main(int argc, char ** argv) {
509
 
510
  // run the inference
511
  {
512
-
513
  printf("Running whisper.cpp inference on %s\n", filename.c_str());
514
  whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
515
 
 
43
  int32_t port = 8080;
44
  int32_t read_timeout = 600;
45
  int32_t write_timeout = 600;
46
+
47
+ bool ffmpeg_converter = false;
48
  };
49
 
50
  struct whisper_params {
 
159
  fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
160
  fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
161
  fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
162
+ fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
163
  fprintf(stderr, "\n");
164
  }
165
 
 
206
  else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
207
  else if ( arg == "--host") { sparams.hostname = argv[++i]; }
208
  else if ( arg == "--public") { sparams.public_path = argv[++i]; }
209
+ else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
210
  else {
211
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
212
  whisper_print_usage(argc, argv, params, sparams);
 
224
  int progress_prev;
225
  };
226
 
227
// Verifies that the `ffmpeg` executable can be launched through the system
// shell by running `ffmpeg -version`.
//
// On success a confirmation line is printed to stdout and the function
// returns. On failure a diagnostic is written to stderr and the process
// terminates with a non-zero exit status, so that scripts and service
// managers can tell that startup did not succeed.
void check_ffmpeg_availibility() {
    const int result = std::system("ffmpeg -version");

    if (result != 0) {
        // ffmpeg is not available
        fprintf(stderr, "ffmpeg is not found. Please ensure that ffmpeg is installed "
                        "and that its executable is included in your system's PATH.\n");
        exit(1);
    }

    std::cout << "ffmpeg is available." << std::endl;
}
239
+
240
// Converts the audio file at `temp_filename`, in place, to 16 kHz mono
// signed 16-bit PCM WAV by invoking ffmpeg through the system shell.
//
// ffmpeg writes its output to a sibling file named `<temp_filename>_temp.wav`;
// on success the original file is removed and the converted file is renamed
// to `temp_filename`.
//
// Parameters:
//   temp_filename - path of the file to convert; replaced by the WAV output
//                   when the function returns true.
//   error_resp    - receives a JSON error body when the function returns
//                   false; left untouched on success.
//
// Returns true when conversion, removal and rename all succeed.
//
// NOTE(review): `temp_filename` is interpolated into a shell command between
// double quotes without escaping. It must never contain caller-controlled
// text (quotes, `$`, backticks, ...) - confirm at every call site.
bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
    const std::string converted_filename_temp = temp_filename + "_temp.wav";

    // -y: overwrite a stale intermediate file left behind by an earlier failed
    // request instead of aborting at ffmpeg's "overwrite?" prompt.
    const std::string cmd =
        "ffmpeg -y -i \"" + temp_filename + "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" +
        converted_filename_temp + "\" 2>&1";

    if (std::system(cmd.c_str()) != 0) {
        // best-effort cleanup of any partial output; the result is intentionally ignored
        std::remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"FFmpeg conversion failed.\"}";
        return false;
    }

    // Remove the original file first: rename() onto an existing path is not
    // guaranteed to succeed on every platform.
    if (std::remove(temp_filename.c_str()) != 0) {
        std::remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"Failed to remove the original file.\"}";
        return false;
    }

    // Rename the temporary file to match the original filename
    if (std::rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) {
        std::remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"Failed to rename the temporary file.\"}";
        return false;
    }

    return true;
}
265
+
266
  std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
267
  std::string speaker = "";
268
  const int64_t n_samples = pcmf32s[0].size();
 
450
  exit(0);
451
  }
452
 
453
+ if (sparams.ffmpeg_converter) {
454
+ check_ffmpeg_availibility();
455
+ }
456
  // whisper init
457
  struct whisper_context_params cparams;
458
  cparams.use_gpu = params.use_gpu;
 
508
  temp_file << audio_file.content;
509
  temp_file.close();
510
 
511
+ // if file is not wav, convert to wav
512
+
513
+ if (sparams.ffmpeg_converter) {
514
+ std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
515
+ const bool is_converted = convert_to_wav(temp_filename, error_resp);
516
+ if (!is_converted) {
517
+ res.set_content(error_resp, "application/json");
518
+ whisper_mutex.unlock();
519
+ return;
520
+ }
521
+ }
522
+
523
  // read wav content into pcmf32
524
  if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
525
  fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
 
567
 
568
  // run the inference
569
  {
 
570
  printf("Running whisper.cpp inference on %s\n", filename.c_str());
571
  whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
572