Michael Podvitskiy committed:

ggml : add abort_callback for cpu backend (ggml/725)

* a way to use abort_callback with the cpu backend
* whisper update
- ggml-backend.c +22 -4
- ggml-backend.h +3 -2
- ggml.c +1 -1
- ggml.h +7 -2
- whisper.cpp +4 -4
- whisper.h +1 -6
ggml-backend.c
CHANGED
@@ -653,6 +653,9 @@ struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
     size_t work_size;
+
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
 };
 
 GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
@@ -691,6 +694,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
     }
 
+    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     return cpu_plan;
 }
@@ -721,9 +727,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
         cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
         cpu_ctx->work_size = cplan.work_size;
     }
-
     cplan.work_data = cpu_ctx->work_data;
 
+    cplan.abort_callback      = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     ggml_graph_compute(cgraph, &cplan);
     return true;
 }
@@ -759,9 +767,11 @@ static struct ggml_backend_i cpu_backend_i = {
 ggml_backend_t ggml_backend_cpu_init(void) {
     struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
 
-    ctx->n_threads = GGML_DEFAULT_N_THREADS;
-    ctx->work_data = NULL;
-    ctx->work_size = 0;
+    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->work_data           = NULL;
+    ctx->work_size           = 0;
+    ctx->abort_callback      = NULL;
+    ctx->abort_callback_data = NULL;
 
     ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
 
@@ -783,6 +793,14 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }
 
+void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->abort_callback      = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
 GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
 }
ggml-backend.h
CHANGED
@@ -83,8 +83,9 @@ extern "C" {
 
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);
 
-    GGML_API GGML_CALL bool ggml_backend_is_cpu          (ggml_backend_t backend);
-    GGML_API           void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
+    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
     // Create a backend buffer from an existing pointer
     GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
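
For orientation, here is a minimal caller-side sketch of the new CPU-backend abort API. The compute_deadline struct, the abort_after_deadline helper, and the 5-second budget are illustrative assumptions, not part of this commit; only the ggml_backend_* calls come from ggml-backend.h.

#include "ggml-backend.h"

#include <stdbool.h>
#include <time.h>

// Illustrative user data: a wall-clock deadline (not part of the commit).
struct compute_deadline {
    time_t end;
};

// Returning true asks ggml_graph_compute to stop at the next node boundary.
static bool abort_after_deadline(void * data) {
    const struct compute_deadline * d = (const struct compute_deadline *) data;
    return time(NULL) > d->end;
}

int main(void) {
    ggml_backend_t backend = ggml_backend_cpu_init();

    struct compute_deadline deadline = { .end = time(NULL) + 5 }; // ~5 s budget
    ggml_backend_cpu_set_abort_callback(backend, abort_after_deadline, &deadline);

    // ... build a graph and run it with ggml_backend_graph_compute(backend, graph) ...

    ggml_backend_free(backend);
    return 0;
}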
ggml.c
CHANGED
@@ -16560,7 +16560,7 @@ struct ggml_compute_state_shared {
     atomic_int node_n;    // active graph node
     atomic_int node_task; // active graph node task phase
 
-    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
+    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
     void * abort_callback_data;
 };
 
ggml.h
CHANGED
@@ -567,6 +567,11 @@ extern "C" {
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+    // Abort callback
+    // If not NULL, called before ggml computation
+    // If it returns true, the computation is aborted
+    typedef bool (*ggml_abort_callback)(void * data);
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -576,8 +581,8 @@ extern "C" {
         int n_threads;
 
         // abort ggml_graph_compute when true
-        bool (*abort_callback)(void * data);
-        void * abort_callback_data;
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };
 
     enum ggml_cgraph_eval_order {
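
The new ggml_abort_callback typedef can also be wired into a compute plan directly when calling ggml_graph_compute without a backend. A rough sketch follows, assuming a graph gf built elsewhere and an application-owned g_stop flag (both assumptions, not from the commit):

#include "ggml.h"

#include <stdbool.h>
#include <stddef.h>

// Application-owned flag, e.g. flipped from a signal handler or another thread.
static volatile bool g_stop = false;

static bool stop_requested(void * data) {
    (void) data;
    return g_stop; // true => ggml_graph_compute aborts
}

static void compute_with_abort(struct ggml_cgraph * gf, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);

    plan.abort_callback      = stop_requested;
    plan.abort_callback_data = NULL;

    // Work-buffer allocation for plan.work_data is omitted here;
    // see ggml_graph_compute_helper in whisper.cpp below for the full pattern.
    ggml_graph_compute(gf, &plan);
}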
whisper.cpp
CHANGED
@@ -156,11 +156,11 @@ static bool ggml_graph_compute_helper(
         struct ggml_cgraph * graph,
         std::vector<uint8_t> & buf,
         int n_threads,
-        whisper_abort_callback abort_callback,
+        ggml_abort_callback abort_callback,
         void * abort_callback_data) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 
-    plan.abort_callback = abort_callback;
+    plan.abort_callback      = abort_callback;
     plan.abort_callback_data = abort_callback_data;
 
     if (plan.work_size > 0) {
@@ -2130,7 +2130,7 @@ static bool whisper_encode_internal(
         whisper_state & wstate,
         const int mel_offset,
         const int n_threads,
-        whisper_abort_callback abort_callback,
+        ggml_abort_callback abort_callback,
         void * abort_callback_data) {
     const int64_t t_start_us = ggml_time_us();
 
@@ -2561,7 +2561,7 @@ static bool whisper_decode_internal(
         whisper_state & wstate,
         const whisper_batch & batch,
         const int n_threads,
-        whisper_abort_callback abort_callback,
+        ggml_abort_callback abort_callback,
         void * abort_callback_data) {
     const int64_t t_start_us = ggml_time_us();
 
whisper.h
CHANGED
@@ -412,11 +412,6 @@ extern "C" {
     // If it returns false, the computation is aborted
     typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
 
-    // Abort callback
-    // If not NULL, called before ggml computation
-    // If it returns true, the computation is aborted
-    typedef bool (*whisper_abort_callback)(void * user_data);
-
     // Logits filter callback
     // Can be used to modify the logits before sampling
     // If not NULL, called after applying temperature to logits
@@ -513,7 +508,7 @@ extern "C" {
         void * encoder_begin_callback_user_data;
 
         // called each time before ggml computation starts
-        whisper_abort_callback abort_callback;
+        ggml_abort_callback abort_callback;
         void * abort_callback_user_data;
 
         // called by each decoder to filter obtained logits
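
On the whisper.cpp side, the updated params can be used roughly as follows. The cancel_requested helper, the g_cancel flag, and the transcribe wrapper are illustrative assumptions; abort_callback, abort_callback_user_data, whisper_full, and whisper_full_default_params come from whisper.h.

#include "whisper.h"

#include <stdbool.h>
#include <stddef.h>

// Application-owned flag; setting it to true cancels the next ggml computation.
static volatile bool g_cancel = false;

static bool cancel_requested(void * user_data) {
    (void) user_data;
    return g_cancel;
}

// Illustrative wrapper: ctx, samples and n_samples come from the caller.
static int transcribe(struct whisper_context * ctx, const float * samples, int n_samples) {
    struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    params.abort_callback           = cancel_requested;
    params.abort_callback_user_data = NULL;

    return whisper_full(ctx, params, samples, n_samples);
}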