Spaces:

natasa365
/

whisper.cpp

Sleeping

App Files Files Community

Ismatulla Mansurov

ggerganov commited on Nov 27, 2023

Commit

99eeacd

unverified ·

1 Parent(s): ee64ad8

server : automatically convert audio on the server (#1539)

Browse files

* server : automatically convert audio on the server

* server : remove redundant comments

* server : automatic conversion refactor

* server : update server readme

* server : remove unnecessary comments and tabs

* server : put back remove calling

* server : apply suggestions from code review

Co-authored-by: Georgi Gerganov <[email protected]>

* server : check ffmpeg before the server launch

* server : fix indentation

* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <[email protected]>

* server : fix function typo calling

* server : fix function typo calling

* server : add warning in readme

---------

Co-authored-by: Georgi Gerganov <[email protected]>

Files changed (2) hide show

examples/server/README.md +4 -0
examples/server/server.cpp +58 -1

examples/server/README.md CHANGED Viewed

@@ -43,8 +43,12 @@ options:
   -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
   --host HOST,                   [127.0.0.1] Hostname/ip-adress for the server
   --port PORT,                   [8080   ] Port number for the server
 ```
 ## request examples
 **/inference**

   -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
   --host HOST,                   [127.0.0.1] Hostname/ip-adress for the server
   --port PORT,                   [8080   ] Port number for the server
+  --convert,                     [false  ] Convert audio to WAV, requires ffmpeg on the server
 ```
+> [!WARNING]
+> **Do not run the server example with administrative privileges and ensure it's operated in a sandbox environment, especially since it involves risky operations like accepting user file uploads and using ffmpeg for format conversions. Always validate and sanitize inputs to guard against potential security threats.**
 ## request examples
 **/inference**

examples/server/server.cpp CHANGED Viewed

@@ -43,6 +43,8 @@ struct server_params
     int32_t port          = 8080;
     int32_t read_timeout  = 600;
     int32_t write_timeout = 600;
 };
 struct whisper_params {
@@ -157,6 +159,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  --host HOST,                   [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
     fprintf(stderr, "  --port PORT,                   [%-7d] Port number for the server\n", sparams.port);
     fprintf(stderr, "  --public PATH,                 [%-7s] Path to the public folder\n", sparams.public_path.c_str());
     fprintf(stderr, "\n");
 }
@@ -203,6 +206,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
         else if (                  arg == "--port")            { sparams.port        = std::stoi(argv[++i]); }
         else if (                  arg == "--host")            { sparams.hostname    = argv[++i]; }
         else if (                  arg == "--public")          { sparams.public_path = argv[++i]; }
         else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             whisper_print_usage(argc, argv, params, sparams);
@@ -220,6 +224,45 @@ struct whisper_print_user_data {
     int progress_prev;
 };
 std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
     std::string speaker = "";
     const int64_t n_samples = pcmf32s[0].size();
@@ -407,6 +450,9 @@ int main(int argc, char ** argv) {
         exit(0);
     }
     // whisper init
     struct whisper_context_params cparams;
     cparams.use_gpu = params.use_gpu;
@@ -462,6 +508,18 @@ int main(int argc, char ** argv) {
         temp_file << audio_file.content;
         temp_file.close();
         // read wav content into pcmf32
         if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
             fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
@@ -509,7 +567,6 @@ int main(int argc, char ** argv) {
         // run the inference
         {
             printf("Running whisper.cpp inference on %s\n", filename.c_str());
             whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

     int32_t port          = 8080;
     int32_t read_timeout  = 600;
     int32_t write_timeout = 600;
+    bool ffmpeg_converter = false;
 };
 struct whisper_params {
     fprintf(stderr, "  --host HOST,                   [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
     fprintf(stderr, "  --port PORT,                   [%-7d] Port number for the server\n", sparams.port);
     fprintf(stderr, "  --public PATH,                 [%-7s] Path to the public folder\n", sparams.public_path.c_str());
+    fprintf(stderr, "  --convert,                     [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
     fprintf(stderr, "\n");
 }
         else if (                  arg == "--port")            { sparams.port        = std::stoi(argv[++i]); }
         else if (                  arg == "--host")            { sparams.hostname    = argv[++i]; }
         else if (                  arg == "--public")          { sparams.public_path = argv[++i]; }
+        else if (                  arg == "--convert")         { sparams.ffmpeg_converter     = true; }
         else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             whisper_print_usage(argc, argv, params, sparams);
     int progress_prev;
 };
// Verifies that an `ffmpeg` executable can be launched through the system shell.
//
// Runs `ffmpeg -version` via system(); a zero status is treated as "ffmpeg is available"
// and the function returns normally. Any other status prints a diagnostic and terminates
// the process, so this function does not return when ffmpeg cannot be started.
void check_ffmpeg_availibility() {
    const int result = system("ffmpeg -version");
    if (result == 0) {
        std::cout << "ffmpeg is available." << std::endl;
        return;
    }

    // ffmpeg is not available: report on stderr (like the other error messages in this file)
    // and exit with a failure status so that scripts / service managers can tell that the
    // launch was aborted rather than completed
    std::cerr << "ffmpeg is not found. Please ensure that ffmpeg is installed "
              << "and that its executable is included in your system's PATH. " << std::endl;
    exit(1);
}
// Converts the audio file at `temp_filename`, in place, to 16 kHz mono 16-bit PCM WAV by
// shelling out to ffmpeg.
//
// ffmpeg writes its output to "<temp_filename>_temp.wav". On success the original file is
// removed and the converted file is renamed to `temp_filename`, so callers keep using the
// same path.
//
// Parameters:
//   temp_filename - path of the file to convert (also the path of the result)
//   error_resp    - receives a JSON error object when the function fails; untouched on success
//
// Returns true on success, false if ffmpeg, the removal or the rename failed.
bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
    const std::string converted_filename_temp = temp_filename + "_temp.wav";

    // -y: overwrite a stale output file left behind by an earlier failed run instead of
    //     prompting for confirmation, which would make the conversion fail or block
    // NOTE(review): temp_filename is interpolated into a shell command line - it must never
    //               be derived from untrusted request data
    const std::string cmd =
        "ffmpeg -y -i \"" + temp_filename + "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" +
        converted_filename_temp + "\" 2>&1";

    if (std::system(cmd.c_str()) != 0) {
        // best effort: do not leave a partial / stale intermediate file around
        remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"FFmpeg conversion failed.\"}";
        return false;
    }

    // Remove the original file first: rename() onto an existing file is not portable
    if (remove(temp_filename.c_str()) != 0) {
        remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"Failed to remove the original file.\"}";
        return false;
    }

    // Rename the temporary file to match the original filename
    if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) {
        remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"Failed to rename the temporary file.\"}";
        return false;
    }

    return true;
}
 std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
     std::string speaker = "";
     const int64_t n_samples = pcmf32s[0].size();
         exit(0);
     }
+    if (sparams.ffmpeg_converter) {
+        check_ffmpeg_availibility();
+    }
     // whisper init
     struct whisper_context_params cparams;
     cparams.use_gpu = params.use_gpu;
         temp_file << audio_file.content;
         temp_file.close();
+        // if file is not wav, convert to wav
+        if (sparams.ffmpeg_converter) {
+            std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
+            const bool is_converted = convert_to_wav(temp_filename, error_resp);
+            if (!is_converted) {
+                res.set_content(error_resp, "application/json");
+                whisper_mutex.unlock();
+                return;
+            }
+        }
         // read wav content into pcmf32
         if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
             fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
         // run the inference
         {
             printf("Running whisper.cpp inference on %s\n", filename.c_str());
             whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);