Spaces:
Running
Running
ggml : add error handling to graph_compute (#1714)
Browse files
- bindings/ruby/ext/ggml-backend-impl.h +1 -1
- bindings/ruby/ext/ggml-backend.c +2 -2
- bindings/ruby/ext/ggml-backend.h +1 -1
- ggml-backend-impl.h +1 -1
- ggml-backend.c +7 -3
- ggml-backend.h +1 -1
- ggml-cuda.cu +3 -1
- ggml-metal.h +1 -1
- ggml-metal.m +5 -4
- whisper.cpp +16 -8
bindings/ruby/ext/ggml-backend-impl.h
CHANGED
|
@@ -70,7 +70,7 @@ extern "C" {
|
|
| 70 |
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 71 |
|
| 72 |
// compute graph without a plan
|
| 73 |
-
void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 74 |
|
| 75 |
// check if the backend supports an operation
|
| 76 |
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
|
|
|
| 70 |
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 71 |
|
| 72 |
// compute graph without a plan
|
| 73 |
+
bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 74 |
|
| 75 |
// check if the backend supports an operation
|
| 76 |
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
bindings/ruby/ext/ggml-backend.c
CHANGED
|
@@ -156,8 +156,8 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
|
|
| 156 |
backend->iface.graph_plan_compute(backend, plan);
|
| 157 |
}
|
| 158 |
|
| 159 |
-
void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 160 |
-
backend->iface.graph_compute(backend, cgraph);
|
| 161 |
}
|
| 162 |
|
| 163 |
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
|
|
| 156 |
backend->iface.graph_plan_compute(backend, plan);
|
| 157 |
}
|
| 158 |
|
| 159 |
+
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 160 |
+
return backend->iface.graph_compute(backend, cgraph);
|
| 161 |
}
|
| 162 |
|
| 163 |
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
bindings/ruby/ext/ggml-backend.h
CHANGED
|
@@ -52,7 +52,7 @@ extern "C" {
|
|
| 52 |
|
| 53 |
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 54 |
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 55 |
-
GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 56 |
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
|
| 57 |
|
| 58 |
// tensor copy between different backends
|
|
|
|
| 52 |
|
| 53 |
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 54 |
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 55 |
+
GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 56 |
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
|
| 57 |
|
| 58 |
// tensor copy between different backends
|
ggml-backend-impl.h
CHANGED
|
@@ -90,7 +90,7 @@ extern "C" {
|
|
| 90 |
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 91 |
|
| 92 |
// compute graph without a plan
|
| 93 |
-
void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 94 |
|
| 95 |
// check if the backend supports an operation
|
| 96 |
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
|
|
|
| 90 |
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 91 |
|
| 92 |
// compute graph without a plan
|
| 93 |
+
bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 94 |
|
| 95 |
// check if the backend supports an operation
|
| 96 |
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
ggml-backend.c
CHANGED
|
@@ -195,11 +195,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
|
|
| 195 |
ggml_backend_synchronize(backend);
|
| 196 |
}
|
| 197 |
|
| 198 |
-
void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 199 |
-
backend->iface.graph_compute(backend, cgraph);
|
|
|
|
|
|
|
| 200 |
|
| 201 |
// TODO: optional sync
|
| 202 |
ggml_backend_synchronize(backend);
|
|
|
|
| 203 |
}
|
| 204 |
|
| 205 |
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
@@ -597,7 +600,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
|
|
| 597 |
GGML_UNUSED(backend);
|
| 598 |
}
|
| 599 |
|
| 600 |
-
static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 601 |
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
| 602 |
|
| 603 |
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
|
@@ -611,6 +614,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
|
|
| 611 |
cplan.work_data = cpu_ctx->work_data;
|
| 612 |
|
| 613 |
ggml_graph_compute(cgraph, &cplan);
|
|
|
|
| 614 |
}
|
| 615 |
|
| 616 |
static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
|
|
| 195 |
ggml_backend_synchronize(backend);
|
| 196 |
}
|
| 197 |
|
| 198 |
+
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 199 |
+
if (!backend->iface.graph_compute(backend, cgraph)) {
|
| 200 |
+
return false;
|
| 201 |
+
}
|
| 202 |
|
| 203 |
// TODO: optional sync
|
| 204 |
ggml_backend_synchronize(backend);
|
| 205 |
+
return true;
|
| 206 |
}
|
| 207 |
|
| 208 |
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
|
|
| 600 |
GGML_UNUSED(backend);
|
| 601 |
}
|
| 602 |
|
| 603 |
+
static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 604 |
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
| 605 |
|
| 606 |
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
|
|
|
| 614 |
cplan.work_data = cpu_ctx->work_data;
|
| 615 |
|
| 616 |
ggml_graph_compute(cgraph, &cplan);
|
| 617 |
+
return true;
|
| 618 |
}
|
| 619 |
|
| 620 |
static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
ggml-backend.h
CHANGED
|
@@ -58,7 +58,7 @@ extern "C" {
|
|
| 58 |
|
| 59 |
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 60 |
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 61 |
-
GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 62 |
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
|
| 63 |
|
| 64 |
// tensor copy between different backends
|
|
|
|
| 58 |
|
| 59 |
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 60 |
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 61 |
+
GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 62 |
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
|
| 63 |
|
| 64 |
// tensor copy between different backends
|
ggml-cuda.cu
CHANGED
|
@@ -9910,7 +9910,7 @@ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_ba
|
|
| 9910 |
UNUSED(plan);
|
| 9911 |
}
|
| 9912 |
|
| 9913 |
-
static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
| 9914 |
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
| 9915 |
|
| 9916 |
ggml_cuda_set_main_device(cuda_ctx->device);
|
|
@@ -9967,6 +9967,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
|
|
| 9967 |
}
|
| 9968 |
|
| 9969 |
UNUSED(backend);
|
|
|
|
|
|
|
| 9970 |
}
|
| 9971 |
|
| 9972 |
static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
|
|
|
| 9910 |
UNUSED(plan);
|
| 9911 |
}
|
| 9912 |
|
| 9913 |
+
static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
| 9914 |
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
| 9915 |
|
| 9916 |
ggml_cuda_set_main_device(cuda_ctx->device);
|
|
|
|
| 9967 |
}
|
| 9968 |
|
| 9969 |
UNUSED(backend);
|
| 9970 |
+
|
| 9971 |
+
return true;
|
| 9972 |
}
|
| 9973 |
|
| 9974 |
static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
ggml-metal.h
CHANGED
|
@@ -87,7 +87,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
|
|
| 87 |
|
| 88 |
// same as ggml_graph_compute but uses Metal
|
| 89 |
// creates gf->n_threads command buffers in parallel
|
| 90 |
-
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
| 91 |
|
| 92 |
//
|
| 93 |
// backend API
|
|
|
|
| 87 |
|
| 88 |
// same as ggml_graph_compute but uses Metal
|
| 89 |
// creates gf->n_threads command buffers in parallel
|
| 90 |
+
bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
| 91 |
|
| 92 |
//
|
| 93 |
// backend API
|
ggml-metal.m
CHANGED
|
@@ -977,7 +977,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
|
|
| 977 |
return false;
|
| 978 |
}
|
| 979 |
}
|
| 980 |
-
void ggml_metal_graph_compute(
|
| 981 |
struct ggml_metal_context * ctx,
|
| 982 |
struct ggml_cgraph * gf) {
|
| 983 |
@autoreleasepool {
|
|
@@ -2405,10 +2405,11 @@ void ggml_metal_graph_compute(
|
|
| 2405 |
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
|
| 2406 |
if (status != MTLCommandBufferStatusCompleted) {
|
| 2407 |
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
| 2408 |
-
|
| 2409 |
}
|
| 2410 |
}
|
| 2411 |
|
|
|
|
| 2412 |
}
|
| 2413 |
}
|
| 2414 |
|
|
@@ -2688,10 +2689,10 @@ static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggm
|
|
| 2688 |
UNUSED(backend);
|
| 2689 |
}
|
| 2690 |
|
| 2691 |
-
static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 2692 |
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
| 2693 |
|
| 2694 |
-
ggml_metal_graph_compute(metal_ctx, cgraph);
|
| 2695 |
}
|
| 2696 |
|
| 2697 |
static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
|
|
| 977 |
return false;
|
| 978 |
}
|
| 979 |
}
|
| 980 |
+
bool ggml_metal_graph_compute(
|
| 981 |
struct ggml_metal_context * ctx,
|
| 982 |
struct ggml_cgraph * gf) {
|
| 983 |
@autoreleasepool {
|
|
|
|
| 2405 |
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
|
| 2406 |
if (status != MTLCommandBufferStatusCompleted) {
|
| 2407 |
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
| 2408 |
+
return false;
|
| 2409 |
}
|
| 2410 |
}
|
| 2411 |
|
| 2412 |
+
return true;
|
| 2413 |
}
|
| 2414 |
}
|
| 2415 |
|
|
|
|
| 2689 |
UNUSED(backend);
|
| 2690 |
}
|
| 2691 |
|
| 2692 |
+
static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 2693 |
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
| 2694 |
|
| 2695 |
+
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
| 2696 |
}
|
| 2697 |
|
| 2698 |
static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
whisper.cpp
CHANGED
|
@@ -152,7 +152,7 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
|
|
| 152 |
// ggml helpers
|
| 153 |
//
|
| 154 |
|
| 155 |
-
static void ggml_graph_compute_helper(
|
| 156 |
struct ggml_cgraph * graph,
|
| 157 |
std::vector<uint8_t> & buf,
|
| 158 |
int n_threads,
|
|
@@ -168,10 +168,10 @@ static void ggml_graph_compute_helper(
|
|
| 168 |
plan.work_data = buf.data();
|
| 169 |
}
|
| 170 |
|
| 171 |
-
ggml_graph_compute(graph, &plan);
|
| 172 |
}
|
| 173 |
|
| 174 |
-
static void ggml_graph_compute_helper(
|
| 175 |
struct ggml_backend * backend,
|
| 176 |
struct ggml_cgraph * graph,
|
| 177 |
int n_threads) {
|
|
@@ -183,7 +183,7 @@ static void ggml_graph_compute_helper(
|
|
| 183 |
ggml_backend_metal_set_n_cb(backend, n_threads);
|
| 184 |
}
|
| 185 |
#endif
|
| 186 |
-
ggml_backend_graph_compute(backend, graph);
|
| 187 |
}
|
| 188 |
|
| 189 |
// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
|
|
@@ -2103,7 +2103,9 @@ static bool whisper_encode_internal(
|
|
| 2103 |
ggml_allocr_alloc_graph(alloc, gf);
|
| 2104 |
|
| 2105 |
if (!whisper_encode_external(wstate)) {
|
| 2106 |
-
ggml_graph_compute_helper(wstate.backend, gf, n_threads);
|
|
|
|
|
|
|
| 2107 |
}
|
| 2108 |
}
|
| 2109 |
|
|
@@ -2117,7 +2119,9 @@ static bool whisper_encode_internal(
|
|
| 2117 |
|
| 2118 |
ggml_allocr_alloc_graph(alloc, gf);
|
| 2119 |
|
| 2120 |
-
ggml_graph_compute_helper(wstate.backend, gf, n_threads);
|
|
|
|
|
|
|
| 2121 |
}
|
| 2122 |
|
| 2123 |
// cross
|
|
@@ -2130,7 +2134,9 @@ static bool whisper_encode_internal(
|
|
| 2130 |
|
| 2131 |
ggml_allocr_alloc_graph(alloc, gf);
|
| 2132 |
|
| 2133 |
-
ggml_graph_compute_helper(wstate.backend, gf, n_threads);
|
|
|
|
|
|
|
| 2134 |
}
|
| 2135 |
|
| 2136 |
wstate.t_encode_us += ggml_time_us() - t_start_us;
|
|
@@ -2552,7 +2558,9 @@ static bool whisper_decode_internal(
|
|
| 2552 |
|
| 2553 |
logits = gf->nodes[gf->n_nodes - 1];
|
| 2554 |
|
| 2555 |
-
ggml_graph_compute_helper(wstate.backend, gf, n_threads);
|
|
|
|
|
|
|
| 2556 |
}
|
| 2557 |
|
| 2558 |
logits_out.resize(n_tokens*n_vocab);
|
|
|
|
| 152 |
// ggml helpers
|
| 153 |
//
|
| 154 |
|
| 155 |
+
static bool ggml_graph_compute_helper(
|
| 156 |
struct ggml_cgraph * graph,
|
| 157 |
std::vector<uint8_t> & buf,
|
| 158 |
int n_threads,
|
|
|
|
| 168 |
plan.work_data = buf.data();
|
| 169 |
}
|
| 170 |
|
| 171 |
+
return ggml_graph_compute(graph, &plan);
|
| 172 |
}
|
| 173 |
|
| 174 |
+
static bool ggml_graph_compute_helper(
|
| 175 |
struct ggml_backend * backend,
|
| 176 |
struct ggml_cgraph * graph,
|
| 177 |
int n_threads) {
|
|
|
|
| 183 |
ggml_backend_metal_set_n_cb(backend, n_threads);
|
| 184 |
}
|
| 185 |
#endif
|
| 186 |
+
return ggml_backend_graph_compute(backend, graph);
|
| 187 |
}
|
| 188 |
|
| 189 |
// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
|
|
|
|
| 2103 |
ggml_allocr_alloc_graph(alloc, gf);
|
| 2104 |
|
| 2105 |
if (!whisper_encode_external(wstate)) {
|
| 2106 |
+
if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
|
| 2107 |
+
return false;
|
| 2108 |
+
}
|
| 2109 |
}
|
| 2110 |
}
|
| 2111 |
|
|
|
|
| 2119 |
|
| 2120 |
ggml_allocr_alloc_graph(alloc, gf);
|
| 2121 |
|
| 2122 |
+
if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
|
| 2123 |
+
return false;
|
| 2124 |
+
}
|
| 2125 |
}
|
| 2126 |
|
| 2127 |
// cross
|
|
|
|
| 2134 |
|
| 2135 |
ggml_allocr_alloc_graph(alloc, gf);
|
| 2136 |
|
| 2137 |
+
if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
|
| 2138 |
+
return false;
|
| 2139 |
+
}
|
| 2140 |
}
|
| 2141 |
|
| 2142 |
wstate.t_encode_us += ggml_time_us() - t_start_us;
|
|
|
|
| 2558 |
|
| 2559 |
logits = gf->nodes[gf->n_nodes - 1];
|
| 2560 |
|
| 2561 |
+
if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
|
| 2562 |
+
return false;
|
| 2563 |
+
}
|
| 2564 |
}
|
| 2565 |
|
| 2566 |
logits_out.resize(n_tokens*n_vocab);
|