ggml : use OpenMP as a thread pool (llama/7606)
* ggml : Added OpenMP for multi-threaded processing
* ggml : Limit the number of threads used to avoid deadlock
* update shared state n_threads in parallel region
* clear numa affinity for main thread even with openmp
* enable openmp by default
* fix msvc build
* disable openmp on macos
* ci : disable openmp with thread sanitizer
* Update ggml.c
Co-authored-by: Georgi Gerganov <[email protected]>
---------
Co-authored-by: slaren <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
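As a quick illustration of the pattern this commit adopts, here is a minimal standalone sketch (not the ggml code itself; `worker`, the requested thread count, and the build line are illustrative assumptions): a single `omp parallel` region serves as the thread pool, `omp single` records how many threads OpenMP actually granted, and every team member then runs the worker.

// Minimal sketch of the OpenMP "thread pool" pattern used in this commit.
// Build with: cc -fopenmp sketch.c
// `worker` stands in for ggml_graph_compute_thread(); it is not a ggml API.
#include <omp.h>
#include <stdio.h>

static void worker(int ith, int nth) {
    // illustrative per-thread work
    printf("thread %d of %d\n", ith, nth);
}

int main(void) {
    int n_threads = 8; // requested; OpenMP may grant fewer

    #pragma omp parallel num_threads(n_threads)
    {
        #pragma omp single
        {
            // re-read the actual team size inside the region; otherwise the
            // workers could end up waiting for threads that never started
            n_threads = omp_get_num_threads();
        }
        // implicit barrier after `single`, so every thread sees the update
        worker(omp_get_thread_num(), n_threads);
    }
    return 0;
}

Compared with the pthread pool it replaces, the parallel region is created and torn down by the OpenMP runtime, which typically reuses its own worker threads across graph computations instead of spawning and joining threads on every call.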
ggml.c CHANGED
@@ -5,6 +5,7 @@
 #include "ggml-quants.h"
 #include "ggml.h"
 
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -28,6 +29,10 @@
 #include <syscall.h>
 #endif
 
+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
+
 #ifdef GGML_USE_METAL
 #include <unistd.h>
 #endif
@@ -1756,7 +1761,7 @@ struct ggml_compute_state_shared {
     int64_t perf_node_start_cycles;
     int64_t perf_node_start_time_us;
 
-    const int n_threads;
+    int n_threads;
 
     // synchronization primitives
     atomic_int n_active; // num active threads
@@ -19670,6 +19675,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     return cplan;
 }
 
+static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
+    enum ggml_status compute_status = GGML_STATUS_SUCCESS;
+
+#ifdef GGML_USE_OPENMP
+    if (n_threads > 1) {
+        #pragma omp parallel num_threads(n_threads)
+        {
+            #pragma omp single
+            {
+                // update the number of threads from the actual number of threads that we got from OpenMP
+                n_threads = omp_get_num_threads();
+                workers[0].shared->n_threads = n_threads;
+                workers[0].shared->n_active  = n_threads;
+            }
+            ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
+        }
+    } else {
+        ggml_graph_compute_thread(&workers[0]);
+    }
+#else
+    // create thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; ++j) {
+            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+            GGML_ASSERT(rc == 0);
+            UNUSED(rc);
+        }
+    }
+
+    // this is a work thread too
+    ggml_graph_compute_thread(&workers[0]);
+
+    // join or kill thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; j++) {
+            const int rc = ggml_thread_join(workers[j].thrd, NULL);
+            GGML_ASSERT(rc == 0);
+            UNUSED(rc);
+        }
+    }
+#endif
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();
+
+    for (int j = 0; j < n_threads; j++) {
+        if (workers[j].ec != GGML_STATUS_SUCCESS) {
+            compute_status = workers[j].ec;
+            break;
+        }
+    }
+    return compute_status;
+}
+
 enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     {
         GGML_ASSERT(cplan);
@@ -19680,7 +19738,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         }
     }
 
-    const int n_threads = cplan->n_threads;
+    int n_threads = cplan->n_threads;
+
+#if defined(GGML_USE_OPENMP)
+    n_threads = MIN(n_threads, omp_get_max_threads());
+#endif
 
     struct ggml_compute_state_shared state_shared = {
         /*.cgraph =*/ cgraph,
@@ -19696,47 +19758,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         /*.current_chunk; =*/ 0,
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
-                .ith    = j,
-                .shared = &state_shared,
-                .ec     = GGML_STATUS_SUCCESS,
-            };
-
-            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-    }
-
-    workers[0].ith = 0;
-    workers[0].shared = &state_shared;
-    workers[0].ec = GGML_STATUS_SUCCESS;
-
     const int64_t perf_start_cycles  = ggml_perf_cycles();
     const int64_t perf_start_time_us = ggml_perf_time_us();
 
-    // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
-    enum ggml_status compute_status = workers[0].ec;
-
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
-    // join or kill thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            if (workers[j].ec != GGML_STATUS_SUCCESS)
-                compute_status = workers[j].ec;
-        }
+    for (int j = 0; j < n_threads; ++j) {
+        workers[j] = (struct ggml_compute_state) {
+            .thrd   = 0,
+            .ith    = j,
+            .shared = &state_shared,
+            .ec     = GGML_STATUS_SUCCESS,
+        };
     }
 
+    enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
+
     // performance stats (graph)
     {
         int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
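For context on the MIN(n_threads, omp_get_max_threads()) clamp and the update inside `omp single` above: OpenMP is free to build a smaller team than the num_threads clause requests (for example when OMP_THREAD_LIMIT or dynamic thread adjustment caps it), and counters such as ggml's n_active would spin forever if they were initialized for threads that never start. The following standalone sketch (illustrative only, not ggml code; the numbers are arbitrary) just prints the requested versus granted team size.

// Sketch: requested vs. granted OpenMP threads.
// Build with: cc -fopenmp threads.c
// Try: OMP_THREAD_LIMIT=2 ./a.out   (granted will be capped at 2)
#include <omp.h>
#include <stdio.h>

int main(void) {
    int requested = 16;
    int granted   = 0;

    // upper bound OpenMP would use for a region without a num_threads clause
    printf("omp_get_max_threads() = %d\n", omp_get_max_threads());

    #pragma omp parallel num_threads(requested)
    {
        // one thread records the size of the team that was actually created
        #pragma omp single
        granted = omp_get_num_threads();
    }

    printf("requested %d, granted %d\n", requested, granted);
    return 0;
}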