ggerganov committed
Commit 5d1dffc · unverified · 1 Parent(s): 743cace

ggml : do not sched_yield when calling BLAS (llama/4761)


* ggml : do not sched_yield when calling BLAS

ggml-ci

* ggml : fix do_yield logic

ggml-ci

* ggml : simplify do_yield logic

ggml-ci
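
In short: worker threads waiting in the spin loop previously called sched_yield() unconditionally whenever a BLAS backend was compiled in. After this commit they yield only while the previously computed node is a GGML_OP_MUL_MAT (i.e. while one thread may be inside a BLAS call that runs its own threads), and busy-spin otherwise for lower wake-up latency. Below is a minimal sketch of that conditional-yield wait pattern, assuming POSIX sched_yield; the last_node_is_mul_mat helper is a hypothetical stand-in for cgraph->nodes[last]->op == GGML_OP_MUL_MAT, not actual ggml API:

    #include <sched.h>      // sched_yield
    #include <stdatomic.h>
    #include <stdbool.h>

    extern bool last_node_is_mul_mat(int last); // hypothetical stand-in

    // wait until the shared node counter moves past `last`
    static void wait_for_next_node(atomic_int * node_n, int last) {
        // yield only when the finished node may have been offloaded to BLAS,
        // so the spinning threads do not starve the BLAS library's own threads
        const bool do_yield = last < 0 || last_node_is_mul_mat(last);

        while (atomic_load(node_n) == last) {
            if (do_yield) {
                sched_yield();
            }
            // otherwise: pure busy-wait, which keeps wake-up latency minimal
        }
    }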

Files changed (1): ggml.c +14 -27
ggml.c CHANGED
@@ -9704,10 +9704,10 @@ static void ggml_compute_forward_group_norm(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
+static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     //const int64_t ne00 = src0->ne[0];
     //const int64_t ne01 = src0->ne[1];
 
@@ -9787,7 +9787,7 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
         if (params->ith != 0) {
             return;
         }
@@ -16301,24 +16301,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 
                 //n_tasks = MIN(n_threads, MAX(1, nr0/128));
                 //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-
-#if defined(GGML_USE_CUBLAS)
-                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#elif defined(GGML_USE_CLBLAST)
-                if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -16491,6 +16473,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 state->shared->node_n += 1;
                 return (thread_ret_t) GGML_EXIT_ABORTED;
             }
+
             if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
                 // all other threads are finished and spinning
                 // do finalize and init here so we don't have synchronize again
@@ -16556,14 +16539,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         } else {
             // wait for other threads to finish
             const int last = node_n;
+
+            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
+
             while (true) {
                 // TODO: this sched_yield can have significant impact on the performance - either positive or negative
                 //       depending on the workload and the operating system.
                 //       since it is not clear what is the best approach, it should potentially become user-configurable
                 //       ref: https://github.com/ggerganov/ggml/issues/291
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                sched_yield();
-#endif
+                // UPD:  adding the do_yield flag seems to resolve the issue universally
+                if (do_yield) {
+                    sched_yield();
+                }
 
                 node_n = atomic_load(&state->shared->node_n);
                 if (node_n != last) break;
@@ -16642,7 +16629,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             } else
 #endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-            if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+            if (ggml_compute_forward_mul_mat_use_blas(node)) {
                 if (node->src[0]->type != GGML_TYPE_F32) {
                     // here we need memory just for single 2D matrix from src0
                     cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
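
Side note on the signature change: a ggml op node already carries its operands in dst->src[], so the helper only needs the node itself, and all three call sites (forward compute, task counting in ggml_get_n_tasks, work-buffer sizing in ggml_graph_plan) shrink to one argument. An illustrative reduction of the pattern follows; the struct layout and the size threshold are made up for the example, not the real ggml definitions:

    #include <stdbool.h>
    #include <stddef.h>

    struct tensor {
        long long       ne[4];   // dimensions, ne[0] innermost
        struct tensor * src[2];  // operands of the op that produced this node
    };

    // decide from the node alone whether the matmul is big enough for BLAS
    static bool mul_mat_is_large(const struct tensor * dst) {
        const struct tensor * src1 = dst->src[1];
        if (src1 == NULL) {
            return false;
        }
        return dst->ne[0] >= 32 && dst->ne[1] >= 32 && src1->ne[0] >= 32;
    }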