ggml : do not sched_yield when calling BLAS (llama/4761)

* ggml : do not sched_yield when calling BLAS

ggml-ci

* ggml : fix do_yield logic

ggml-ci

* ggml : simplify do_yield logic

ggml-ci
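Summary of the diff below: ggml_compute_forward_mul_mat_use_blas() now takes only the dst tensor and looks up src0/src1 via dst->src; the dead n_tasks = 1 special cases for CUDA/CLBlast/BLAS mul_mat nodes are removed from ggml_get_n_tasks(); and the spin-wait loop in ggml_graph_compute_thread() calls sched_yield() only when do_yield is set (last < 0, or the last scheduled node is a GGML_OP_MUL_MAT), busy-spinning otherwise.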
ggml.c
CHANGED
@@ -9704,10 +9704,10 @@ static void ggml_compute_forward_group_norm(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
+static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     //const int64_t ne00 = src0->ne[0];
     //const int64_t ne01 = src0->ne[1];
 
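To make the signature change above concrete, here is a minimal, self-contained sketch of the same refactor pattern. It is not ggml code: tensor_t, use_blas() and the size threshold are made-up stand-ins; the point is only that the operands can be recovered from dst->src, so callers no longer pass them explicitly.

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef struct tensor_t {
    const struct tensor_t * src[2]; // the operands this tensor was computed from
    long long               ne[2];  // extents of the two matrix dimensions
} tensor_t;

// before the patch the helper took (src0, src1, dst) explicitly;
// afterwards the operands are recovered from dst, so callers pass a single argument
static bool use_blas(const tensor_t * dst) {
    const tensor_t * src0 = dst->src[0];
    const tensor_t * src1 = dst->src[1];

    // stand-in heuristic (not ggml's actual one): only large matrices go to BLAS
    return src0->ne[0] >= 32 && src0->ne[1] >= 32 && src1->ne[1] >= 32;
}

int main(void) {
    tensor_t a = { { NULL, NULL }, { 64, 64 } };
    tensor_t b = { { NULL, NULL }, { 64, 64 } };
    tensor_t c = { { &a,   &b   }, { 64, 64 } };
    printf("use_blas(&c) = %d\n", use_blas(&c));
    return 0;
}
```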
@@ -9787,7 +9787,7 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
         if (params->ith != 0) {
             return;
         }
@@ -16301,24 +16301,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 
                 //n_tasks = MIN(n_threads, MAX(1, nr0/128));
                 //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-
-#if defined(GGML_USE_CUBLAS)
-                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#elif defined(GGML_USE_CLBLAST)
-                if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
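Note that the branches removed above only set n_tasks = 1 for mul_mat nodes that would be handled by cuBLAS, CLBlast or CPU BLAS, and, as their own TODO comments state, this "actually is doing nothing" because the remaining threads keep spinning regardless; dropping them therefore does not change scheduling behaviour.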
@@ -16491,6 +16473,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             state->shared->node_n += 1;
             return (thread_ret_t) GGML_EXIT_ABORTED;
         }
+
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
             // all other threads are finished and spinning
             // do finalize and init here so we don't have synchronize again
@@ -16556,14 +16539,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     } else {
         // wait for other threads to finish
        const int last = node_n;
+
+        const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
+
         while (true) {
             // TODO: this sched_yield can have significant impact on the performance - either positive or negative
             //       depending on the workload and the operating system.
             //       since it is not clear what is the best approach, it should potentially become user-configurable
             //       ref: https://github.com/ggerganov/ggml/issues/291
-            (removed line 1 of 3; content not shown in this view)
-            (removed line 2 of 3; content not shown in this view)
-            (removed line 3 of 3; content not shown in this view)
+            // UPD:  adding the do_yield flag seems to resolve the issue universally
+            if (do_yield) {
+                sched_yield();
+            }
 
             node_n = atomic_load(&state->shared->node_n);
             if (node_n != last) break;
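The hunk above gates the yield on the op of the node being waited on. Below is a minimal sketch of that waiting pattern, assuming POSIX threads and C11 atomics; node_n, long_running and worker are hypothetical names, not ggml's, and the example compresses the real scheduler down to a single waiter and a single hand-off.

```c
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static atomic_int node_n       = -1;   // index of the graph node currently being processed
static bool       long_running = true; // stand-in for "the pending node is a (BLAS) mul_mat"

static void * worker(void * arg) {
    (void) arg;
    const int last = -1; // the "node" this worker has just finished (initial value of node_n)
    while (true) {
        // yield the CPU only while the pending work is expected to run for a long time;
        // for short nodes, busy-spinning keeps the wake-up latency minimal
        if (long_running) {
            sched_yield();
        }
        if (atomic_load(&node_n) != last) {
            break;
        }
    }
    printf("worker observed node_n advance to %d\n", atomic_load(&node_n));
    return NULL;
}

int main(void) {
    pthread_t t;
    pthread_create(&t, NULL, worker, NULL);

    // simulate a long single-threaded step (e.g. a BLAS call) on the main thread ...
    struct timespec ts = { 0, 100 * 1000 * 1000 }; // 100 ms
    nanosleep(&ts, NULL);

    // ... then advance to the next node, which releases the spinning waiter
    atomic_fetch_add(&node_n, 1);
    pthread_join(t, NULL);
    return 0;
}
```

sched_yield() hands the waiter's time slice to any other runnable thread, which is presumably helpful while a single thread sits in a long BLAS call, whereas pure spinning resumes faster for the short per-node waits; the patch itself only claims, in the UPD comment, that gating the yield on do_yield "seems to resolve the issue universally".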
@@ -16642,7 +16629,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } else
 #endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+                if (ggml_compute_forward_mul_mat_use_blas(node)) {
                     if (node->src[0]->type != GGML_TYPE_F32) {
                         // here we need memory just for single 2D matrix from src0
                         cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
|