Masaya Kato, slaren, and ggerganov committed on
Commit
7e5d850
·
1 Parent(s): ad9ee26

ggml : use OpenMP as a thread pool (llama/7606)

Browse files

* ggml: Added OpenMP for multi-threads processing

* ggml : Limit the number of threads used to avoid deadlock

* update shared state n_threads in parallel region

* clear numa affinity for main thread even with openmp

* enable openmp by default

* fix msvc build

* disable openmp on macos

* ci : disable openmp with thread sanitizer

* Update ggml.c

Co-authored-by: Georgi Gerganov <[email protected]>

---------

Co-authored-by: slaren <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>

Files changed (1) hide show
  1. ggml.c +73 -38
ggml.c CHANGED
@@ -5,6 +5,7 @@
5
  #include "ggml-quants.h"
6
  #include "ggml.h"
7
 
 
8
  #if defined(_MSC_VER) || defined(__MINGW32__)
9
  #include <malloc.h> // using malloc.h with MSC/MINGW
10
  #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -28,6 +29,10 @@
28
  #include <syscall.h>
29
  #endif
30
 
 
 
 
 
31
  #ifdef GGML_USE_METAL
32
  #include <unistd.h>
33
  #endif
@@ -1756,7 +1761,7 @@ struct ggml_compute_state_shared {
1756
  int64_t perf_node_start_cycles;
1757
  int64_t perf_node_start_time_us;
1758
 
1759
- const int n_threads;
1760
 
1761
  // synchronization primitives
1762
  atomic_int n_active; // num active threads
@@ -19670,6 +19675,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
19670
  return cplan;
19671
  }
19672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19673
  enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
19674
  {
19675
  GGML_ASSERT(cplan);
@@ -19680,7 +19738,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
19680
  }
19681
  }
19682
 
19683
- const int n_threads = cplan->n_threads;
 
 
 
 
19684
 
19685
  struct ggml_compute_state_shared state_shared = {
19686
  /*.cgraph =*/ cgraph,
@@ -19696,47 +19758,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
19696
  /*.current_chunk; =*/ 0,
19697
  };
19698
  struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
19699
-
19700
- // create thread pool
19701
- if (n_threads > 1) {
19702
- for (int j = 1; j < n_threads; ++j) {
19703
- workers[j] = (struct ggml_compute_state) {
19704
- .thrd = 0,
19705
- .ith = j,
19706
- .shared = &state_shared,
19707
- .ec = GGML_STATUS_SUCCESS,
19708
- };
19709
-
19710
- const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
19711
- GGML_ASSERT(rc == 0);
19712
- UNUSED(rc);
19713
- }
19714
- }
19715
-
19716
- workers[0].ith = 0;
19717
- workers[0].shared = &state_shared;
19718
- workers[0].ec = GGML_STATUS_SUCCESS;
19719
-
19720
  const int64_t perf_start_cycles = ggml_perf_cycles();
19721
  const int64_t perf_start_time_us = ggml_perf_time_us();
19722
 
19723
- // this is a work thread too
19724
- ggml_graph_compute_thread(&workers[0]);
19725
- enum ggml_status compute_status = workers[0].ec;
19726
-
19727
- // don't leave affinity set on the main thread
19728
- clear_numa_thread_affinity();
19729
-
19730
- // join or kill thread pool
19731
- if (n_threads > 1) {
19732
- for (int j = 1; j < n_threads; j++) {
19733
- const int rc = ggml_thread_join(workers[j].thrd, NULL);
19734
- GGML_ASSERT(rc == 0);
19735
- if (workers[j].ec != GGML_STATUS_SUCCESS)
19736
- compute_status = workers[j].ec;
19737
- }
19738
  }
19739
 
 
 
19740
  // performance stats (graph)
19741
  {
19742
  int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
 
5
  #include "ggml-quants.h"
6
  #include "ggml.h"
7
 
8
+
9
  #if defined(_MSC_VER) || defined(__MINGW32__)
10
  #include <malloc.h> // using malloc.h with MSC/MINGW
11
  #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
 
29
  #include <syscall.h>
30
  #endif
31
 
32
+ #ifdef GGML_USE_OPENMP
33
+ #include <omp.h>
34
+ #endif
35
+
36
  #ifdef GGML_USE_METAL
37
  #include <unistd.h>
38
  #endif
 
1761
  int64_t perf_node_start_cycles;
1762
  int64_t perf_node_start_time_us;
1763
 
1764
+ int n_threads;
1765
 
1766
  // synchronization primitives
1767
  atomic_int n_active; // num active threads
 
19675
  return cplan;
19676
  }
19677
 
19678
+ // Run the graph computation across n_threads workers.
+ // With GGML_USE_OPENMP the workers execute inside an OpenMP parallel
+ // region (no explicit thread creation/join); otherwise a transient
+ // pthread pool is created and joined on every call.
+ // Returns the first non-success status reported by any worker.
+ // NOTE(review): assumes workers[0..n_threads-1] were fully initialized
+ // by the caller before this is invoked — confirm at the call site.
+ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
19679
+ enum ggml_status compute_status = GGML_STATUS_SUCCESS;
19680
+
19681
+ #ifdef GGML_USE_OPENMP
19682
+ if (n_threads > 1) {
19683
+ #pragma omp parallel num_threads(n_threads)
19684
+ {
19685
+ // one thread publishes the actual team size to the shared state;
+ // OpenMP may grant fewer threads than requested
+ #pragma omp single
19686
+ {
19687
+ // update the number of threads from the actual number of threads that we got from OpenMP
19688
+ n_threads = omp_get_num_threads();
19689
+ workers[0].shared->n_threads = n_threads;
19690
+ workers[0].shared->n_active = n_threads;
19691
+ }
19692
+ // every OpenMP thread (including the one that ran the single
+ // block) acts as a worker, indexed by its OpenMP thread id
+ ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
19693
+ }
19694
+ } else {
19695
+ // single-threaded: run directly on the calling thread
+ ggml_graph_compute_thread(&workers[0]);
19696
+ }
19697
+ #else
19698
+ // create thread pool
19699
+ if (n_threads > 1) {
19700
+ for (int j = 1; j < n_threads; ++j) {
19701
+ const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
19702
+ GGML_ASSERT(rc == 0);
19703
+ UNUSED(rc);
19704
+ }
19705
+ }
19706
+
19707
+ // this is a work thread too
19708
+ ggml_graph_compute_thread(&workers[0]);
19709
+
19710
+ // join or kill thread pool
19711
+ if (n_threads > 1) {
19712
+ for (int j = 1; j < n_threads; j++) {
19713
+ const int rc = ggml_thread_join(workers[j].thrd, NULL);
19714
+ GGML_ASSERT(rc == 0);
19715
+ UNUSED(rc);
19716
+ }
19717
+ }
19718
+ #endif
19719
+ // don't leave affinity set on the main thread
19720
+ clear_numa_thread_affinity();
19721
+
19722
+ // propagate the first worker error, if any
+ for (int j = 0; j < n_threads; j++) {
19723
+ if (workers[j].ec != GGML_STATUS_SUCCESS) {
19724
+ compute_status = workers[j].ec;
19725
+ break;
19726
+ }
19727
+ }
19728
+ return compute_status;
19729
+ }
19730
+
19731
  enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
19732
  {
19733
  GGML_ASSERT(cplan);
 
19738
  }
19739
  }
19740
 
19741
+ int n_threads = cplan->n_threads;
19742
+
19743
+ #if defined(GGML_USE_OPENMP)
19744
+ n_threads = MIN(n_threads, omp_get_max_threads());
19745
+ #endif
19746
 
19747
  struct ggml_compute_state_shared state_shared = {
19748
  /*.cgraph =*/ cgraph,
 
19758
  /*.current_chunk; =*/ 0,
19759
  };
19760
  struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19761
  const int64_t perf_start_cycles = ggml_perf_cycles();
19762
  const int64_t perf_start_time_us = ggml_perf_time_us();
19763
 
19764
+ for (int j = 0; j < n_threads; ++j) {
19765
+ workers[j] = (struct ggml_compute_state) {
19766
+ .thrd = 0,
19767
+ .ith = j,
19768
+ .shared = &state_shared,
19769
+ .ec = GGML_STATUS_SUCCESS,
19770
+ };
 
 
 
 
 
 
 
 
19771
  }
19772
 
19773
+ enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
19774
+
19775
  // performance stats (graph)
19776
  {
19777
  int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;