Max Krasnyansky Diego Devesa committed on
Commit
d5d55f2
·
1 Parent(s): a75e157

threading: support for GGML_SCHED_PRIO_LOW, update thread info on Windows to avoid throttling (llama/12995)

Browse files

* threading: support for GGML_SCHED_PRIO_LOW, update thread info on Windows to avoid throttling

We talked about adding LOW priority for GGML threads in the original threadpool PR.
It might be useful for some cases to avoid contention.

Latest Windows ARM64 releases started parking (offlining) the CPU cores
more aggressively, which results in suboptimal performance with n_threads > 4.
To deal with that we now disable Power Throttling for our threads for the NORMAL
and higher priorities.

Co-authored-by: Diego Devesa <[email protected]>

* threading: disable SetThreadInfo() calls for older Windows versions

* Update tools/llama-bench/llama-bench.cpp

Co-authored-by: Diego Devesa <[email protected]>

---------

Co-authored-by: Diego Devesa <[email protected]>

ggml/include/ggml.h CHANGED
@@ -2178,6 +2178,7 @@ extern "C" {
2178
 
2179
  // scheduling priorities
2180
  enum ggml_sched_priority {
 
2181
  GGML_SCHED_PRIO_NORMAL,
2182
  GGML_SCHED_PRIO_MEDIUM,
2183
  GGML_SCHED_PRIO_HIGH,
 
2178
 
2179
  // scheduling priorities
2180
  enum ggml_sched_priority {
2181
+ GGML_SCHED_PRIO_LOW = -1,
2182
  GGML_SCHED_PRIO_NORMAL,
2183
  GGML_SCHED_PRIO_MEDIUM,
2184
  GGML_SCHED_PRIO_HIGH,
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -2418,12 +2418,32 @@ static bool ggml_thread_apply_priority(int32_t prio) {
2418
  // This is up to the applications.
2419
  DWORD p = THREAD_PRIORITY_NORMAL;
2420
  switch (prio) {
 
2421
  case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
2422
  case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
2423
  case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
2424
  case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
2425
  }
2426
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2427
  if (prio == GGML_SCHED_PRIO_NORMAL) {
2428
  // Keep inherited policy/priority
2429
  return true;
@@ -2451,6 +2471,8 @@ static bool ggml_thread_apply_priority(int32_t prio) {
2451
  struct sched_param p;
2452
  int32_t policy = SCHED_OTHER;
2453
  switch (prio) {
 
 
2454
  case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
2455
  case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
2456
  case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
@@ -2507,6 +2529,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
2507
  struct sched_param p;
2508
  int32_t policy = SCHED_OTHER;
2509
  switch (prio) {
 
2510
  case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
2511
  case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
2512
  case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
 
2418
  // This is up to the applications.
2419
  DWORD p = THREAD_PRIORITY_NORMAL;
2420
  switch (prio) {
2421
+ case GGML_SCHED_PRIO_LOW: p = THREAD_PRIORITY_BELOW_NORMAL; break;
2422
  case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
2423
  case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
2424
  case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
2425
  case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
2426
  }
2427
 
2428
+ if (prio != GGML_SCHED_PRIO_LOW) {
2429
+ // Tell Windows that this thread should not be throttled (needs its own CPU core).
2430
+ // Newer Windows 11 versions aggressively park (offline) CPU cores and often place
2431
+ // all our threads onto the first 4 cores which results in terrible performance with
2432
+ // n_threads > 4
2433
+ #if _WIN32_WINNT >= 0x0602
2434
+ THREAD_POWER_THROTTLING_STATE t;
2435
+ ZeroMemory(&t, sizeof(t));
2436
+ t.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
2437
+ t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
2438
+ t.StateMask = 0;
2439
+
2440
+ if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) {
2441
+ GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError());
2442
+ return false;
2443
+ }
2444
+ #endif
2445
+ }
2446
+
2447
  if (prio == GGML_SCHED_PRIO_NORMAL) {
2448
  // Keep inherited policy/priority
2449
  return true;
 
2471
  struct sched_param p;
2472
  int32_t policy = SCHED_OTHER;
2473
  switch (prio) {
2474
+ // TODO: there seems to be no way to set lower prio on Apple platforms
2475
+ case GGML_SCHED_PRIO_LOW: policy = SCHED_OTHER; p.sched_priority = 0; break;
2476
  case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
2477
  case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
2478
  case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
 
2529
  struct sched_param p;
2530
  int32_t policy = SCHED_OTHER;
2531
  switch (prio) {
2532
+ case GGML_SCHED_PRIO_LOW: policy = SCHED_BATCH; p.sched_priority = 0; break;
2533
  case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
2534
  case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
2535
  case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;