llama : add pipeline parallelism support (llama/6017)
* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs
ggml-ci
* server : add -ub, --ubatch-size parameter
* fix server embedding test
* llama : fix Mamba inference for pipeline parallelism
Tested to work correctly with both `main` and `parallel` examples.
* llama : limit max batch size to n_batch
* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increased to 4 (from 2)
changing this value may improve performance for some systems, but increases memory usage
* fix hip build
* fix sycl build (disable cpy_tensor_async)
* fix hip build
* llama : limit n_batch and n_ubatch to n_ctx during context creation
* llama : fix norm backend
* batched-bench : sync after decode
* swiftui : sync after decode
* ggml : allow ggml_get_rows to use multiple threads if they are available
* check n_ubatch >= n_tokens with non-causal attention
* llama : do not limit n_batch to n_ctx with non-causal attn
* server : construct batch with size of llama_n_batch
* ggml_backend_cpu_graph_compute : fix return value when alloc fails
* llama : better n_batch and n_ubatch comment
* fix merge
* small fix
* reduce default n_batch to 2048
---------
Co-authored-by: Francis Couture-Harpin <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
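
For orientation, a minimal sketch of the batch/micro-batch split that the -b/-ub options above control (illustrative only: process_batch and process_ubatch are placeholder names, not llama.cpp functions):

    #include <stdio.h>
    #include <stddef.h>

    // stand-in for one scheduler pass over a micro-batch (hypothetical helper,
    // not part of llama.cpp)
    static void process_ubatch(const int * tokens, size_t n) {
        (void) tokens;
        printf("micro-batch of %zu tokens\n", n);
    }

    // a logical batch of up to n_batch tokens is processed in micro-batches of
    // at most n_ubatch tokens; with pipeline parallelism the scheduler can
    // overlap consecutive micro-batches across GPUs
    static void process_batch(const int * tokens, size_t n_tokens, size_t n_batch, size_t n_ubatch) {
        if (n_tokens > n_batch) {
            n_tokens = n_batch; // the batch size is limited to n_batch
        }
        for (size_t i = 0; i < n_tokens; i += n_ubatch) {
            size_t n = n_tokens - i < n_ubatch ? n_tokens - i : n_ubatch;
            process_ubatch(tokens + i, n);
        }
    }

    int main(void) {
        int tokens[2048] = {0};
        process_batch(tokens, 2048, 2048, 512); // e.g. -b 2048 -ub 512
        return 0;
    }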
- ggml-alloc.c +45 -64
- ggml-alloc.h +13 -5
- ggml-backend-impl.h +14 -3
- ggml-backend.c +358 -135
- ggml-backend.h +41 -17
- ggml-cuda.cu +146 -29
- ggml-kompute.cpp +5 -0
- ggml-metal.m +5 -0
- ggml-sycl.cpp +6 -1
- ggml-vulkan.cpp +5 -0
- ggml.c +68 -45

ggml-alloc.c

* ggml_tallocr becomes a plain value type: ggml_tallocr_new no longer heap-allocates a handle (the malloc/NULL check and ggml_tallocr_free are removed), and ggml_tallocr_alloc takes a pointer to the struct:

      struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
          void * base = ggml_backend_buffer_get_base(buffer);
          size_t align = ggml_backend_buffer_get_alignment(buffer);

          assert(align && !(align & (align - 1))); // power of 2

          struct ggml_tallocr talloc = (struct ggml_tallocr) {
              /*.buffer    = */ buffer,
              /*.base      = */ base,
              /*.alignment = */ align,
              ...
          return talloc;
      }

      void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
          size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
          size = GGML_PAD(size, talloc->alignment);
          ...

* A new leaf_alloc record tracks the buffer assignment of graph leafs, and the graph allocator keeps one per leaf:

      struct tensor_alloc {
          size_t offset;
          size_t size_max; // 0 = pre-allocated, unused, or view
      };

      struct leaf_alloc {
          int buffer_id;
          struct tensor_alloc leaf;
      };

      struct node_alloc {
          int buffer_id;
          struct tensor_alloc dst;
          ...

      struct ggml_gallocr {
          ...
          struct node_alloc * node_allocs; // [n_nodes]
          int n_nodes;

          struct leaf_alloc * leaf_allocs; // [n_leafs]
          int n_leafs;
      };

* The allocation pass takes per-leaf buffer ids and allocates leafs first so that graph inputs are not overwritten; the old fallback that placed unused leafs in the first buffer is removed:

      static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
          // clear hash tables
          memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
          memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));

          // allocate leafs
          // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
          for (int i = 0; i < graph->n_leafs; i++) {
              struct ggml_tensor * leaf = graph->leafs[i];
              ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
          }

          // count number of children and views
          // allocate other graph inputs and leafs first to avoid overwriting them
          for (int i = 0; i < graph->n_nodes; i++) {
              struct ggml_tensor * node = graph->nodes[i];
              ...

* ggml_gallocr_reserve_n also takes the leaf buffer ids, records the leaf allocations, and allocates a buffer even when it holds no tensors so that views can be initialized:

      bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
          ...
          // allocate in hash table
          ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
          ...
          if (galloc->n_leafs < graph->n_leafs) {
              free(galloc->leaf_allocs);
              galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
              GGML_ASSERT(galloc->leaf_allocs != NULL);
          }
          galloc->n_leafs = graph->n_leafs;
          for (int i = 0; i < graph->n_leafs; i++) {
              struct ggml_tensor * leaf = graph->leafs[i];
              struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
              galloc->leaf_allocs[i].buffer_id     = hn->buffer_id;
              galloc->leaf_allocs[i].leaf.offset   = hn->offset;
              galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
          }

          // reallocate buffers if needed
          ...
          // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
          if (new_size > cur_size || galloc->buffers[i] == NULL) {
              ...

      bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
          return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
      }

* Tensor initialization uses the recorded buffer id, and ggml_gallocr_alloc_graph initializes leafs before nodes:

      static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
          assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);

          if (tensor->view_src != NULL) {
              if (tensor->buffer == NULL) {
                  assert(tensor_alloc->offset == SIZE_MAX);
                  if (tensor->view_src->buffer == NULL) {
                      // this tensor was allocated without ggml-backend
                      return;
                  }
                  ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
              }
          } else {
              if (tensor->data == NULL) {
                  assert(tensor_alloc->offset != SIZE_MAX);
                  assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
                  void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
                  void * addr = (char *)base + tensor_alloc->offset;
                  ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
              } else {
                  if (tensor->buffer == NULL) {
                      // this tensor was allocated without ggml-backend
                      return;
                  }
                  ...

      // in ggml_gallocr_alloc_graph:
      // allocate the graph tensors from the previous assignments
      // leafs
      for (int i = 0; i < graph->n_leafs; i++) {
          struct ggml_tensor * leaf = graph->leafs[i];
          struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
          ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
      }
      // nodes
      for (int i = 0; i < graph->n_nodes; i++) {
          struct ggml_tensor * node = graph->nodes[i];
          ...
          ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
      }

* alloc_tensor_range uses the value-type allocator on the stack, with no free step:

      struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);

      for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
          if (t->data == NULL) {
              if (t->view_src == NULL) {
                  ggml_tallocr_alloc(&tallocr, t);
              } else if (t->buffer == NULL) {
                  ggml_backend_view_init(buffer, t);
              }
          ...
      }

      *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
      (*buffers)[(*n_buffers)++] = buffer;
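
A short usage sketch of the new value-type allocator, mirroring what alloc_tensor_range does above (assumes a ggml context and a backend buffer already exist; illustrative, not code from this commit):

    #include "ggml.h"
    #include "ggml-alloc.h"

    // allocate every non-view tensor of a context from one backend buffer
    void alloc_all(struct ggml_context * ctx, ggml_backend_buffer_t buffer) {
        struct ggml_tallocr tallocr = ggml_tallocr_new(buffer); // lives on the stack

        for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            if (t->data == NULL && t->view_src == NULL) {
                ggml_tallocr_alloc(&tallocr, t); // advances the allocator's offset in the buffer
            }
        }
        // nothing to free: the allocator does not own the buffer
    }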

ggml-alloc.h

* The opaque ggml_tallocr_t handle API (new/free/alloc) is replaced by a public struct, and ggml_gallocr_reserve_n gains a leaf_buffer_ids parameter:

      typedef struct ggml_backend * ggml_backend_t;

      // Tensor allocator
      struct ggml_tallocr {
          ggml_backend_buffer_t buffer;
          void * base;
          size_t alignment;
          size_t offset;
      };

      GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
      GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);

      // Graph allocator
      ...
      // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
      // returns false if the buffer allocation failed
      GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
      GGML_API bool ggml_gallocr_reserve_n(
          ggml_gallocr_t galloc,
          struct ggml_cgraph * graph,
          const int * node_buffer_ids,
          const int * leaf_buffer_ids);

      // automatic reallocation if the topology changes when using a single buffer
      // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
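
A sketch of how a caller might use the extended reserve call together with ggml_gallocr_alloc_graph; the buffer-id arrays are assumed to come from a scheduler's backend assignments, and this is an illustration rather than code from this commit:

    #include "ggml.h"
    #include "ggml-alloc.h"

    // reserve buffers for a worst-case graph with explicit node/leaf buffer ids,
    // then allocate the actual graph from the recorded assignments
    bool reserve_and_alloc(ggml_gallocr_t galloc,
                           struct ggml_cgraph * worst_case_graph,
                           struct ggml_cgraph * graph,
                           const int * node_buffer_ids,   // one id per worst_case_graph->nodes[i]
                           const int * leaf_buffer_ids) { // one id per worst_case_graph->leafs[i]
        if (!ggml_gallocr_reserve_n(galloc, worst_case_graph, node_buffer_ids, leaf_buffer_ids)) {
            return false;
        }
        return ggml_gallocr_alloc_graph(galloc, graph);
    }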

ggml-backend-impl.h

* cpy_tensor_async now takes both the source and destination backends, and the backend interface gains an optional event API used for pipeline parallelism:

      // (optional) asynchronous tensor data access
      void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
      void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
      bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);

      // (optional) complete all pending operations
      void (*GGML_CALL synchronize)(ggml_backend_t backend);

      // compute graph with a plan (not used currently)
      ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
      void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

      ...

      // check if the backend supports an operation
      bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

      // (optional) event synchronization
      ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
      void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
      void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
      void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
      void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
      };

* struct ggml_backend is tidied up and a new event struct is added:

      struct ggml_backend {
          ggml_guid_t guid;

          struct ggml_backend_i iface;
          ggml_backend_context_t context;
      };

      struct ggml_backend_event {
          ggml_backend_t backend;
          void * context;
      };

      //
      // Backend registry
      //
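
A sketch of the record/wait pattern this interface enables between two backends; how the scheduler actually wires this up is not shown in these hunks, so treat the sequence below as an assumption:

    #include "ggml-backend.h"

    // enqueue a cross-backend copy, then make the consumer wait on the device
    // (not the host) before running its graph
    void copy_then_compute(ggml_backend_t producer, ggml_backend_t consumer,
                           struct ggml_tensor * src, struct ggml_tensor * dst,
                           struct ggml_cgraph * graph) {
        ggml_backend_event_t event = ggml_backend_event_new(producer); // NULL if events are unsupported

        ggml_backend_tensor_copy_async(producer, consumer, src, dst);  // enqueue the copy

        if (event != NULL) {
            ggml_backend_event_record(event);         // mark the point after the queued copy
            ggml_backend_event_wait(consumer, event); // consumer's queue waits for the producer
        } else {
            ggml_backend_synchronize(producer);       // fallback: block the host instead
        }

        ggml_backend_graph_compute_async(consumer, graph);
        ggml_backend_event_free(event);               // accepts NULL
    }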
@@ -221,29 +221,29 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
|
|
| 221 |
GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 222 |
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 223 |
|
| 224 |
-
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 225 |
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
|
|
|
| 226 |
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
| 227 |
|
| 228 |
if (!size) {
|
| 229 |
return;
|
| 230 |
}
|
| 231 |
|
| 232 |
-
|
| 233 |
}
|
| 234 |
|
| 235 |
GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 236 |
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 237 |
|
|
|
|
| 238 |
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 239 |
-
GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
|
| 240 |
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
| 241 |
|
| 242 |
if (!size) {
|
| 243 |
return;
|
| 244 |
}
|
| 245 |
|
| 246 |
-
|
| 247 |
}
|
| 248 |
|
| 249 |
void ggml_backend_synchronize(ggml_backend_t backend) {
|
|
@@ -255,18 +255,30 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
|
|
| 255 |
}
|
| 256 |
|
| 257 |
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
|
|
|
|
|
|
| 258 |
return backend->iface.graph_plan_create(backend, cgraph);
|
| 259 |
}
|
| 260 |
|
| 261 |
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
|
|
|
|
|
| 262 |
backend->iface.graph_plan_free(backend, plan);
|
| 263 |
}
|
| 264 |
|
| 265 |
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
|
|
|
|
|
| 266 |
return backend->iface.graph_plan_compute(backend, plan);
|
| 267 |
}
|
| 268 |
|
| 269 |
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
return backend->iface.graph_compute(backend, cgraph);
|
| 271 |
}
|
| 272 |
|
|
@@ -314,34 +326,68 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
|
|
| 314 |
}
|
| 315 |
}
|
| 316 |
|
| 317 |
-
void ggml_backend_tensor_copy_async(ggml_backend_t
|
| 318 |
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
| 319 |
|
| 320 |
if (src == dst) {
|
| 321 |
return;
|
| 322 |
}
|
| 323 |
|
| 324 |
-
if (
|
| 325 |
-
if (
|
| 326 |
-
|
| 327 |
-
return;
|
| 328 |
-
}
|
| 329 |
}
|
| 330 |
}
|
| 331 |
|
| 332 |
-
|
|
|
|
| 333 |
if (ggml_backend_buffer_is_host(src->buffer)) {
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
else {
|
|
|
|
| 337 |
ggml_backend_tensor_copy(src, dst);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
}
|
| 340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
|
| 342 |
// backend registry
|
| 343 |
|
| 344 |
-
#define
|
| 345 |
|
| 346 |
struct ggml_backend_reg {
|
| 347 |
char name[128];
|
|
@@ -350,7 +396,7 @@ struct ggml_backend_reg {
|
|
| 350 |
void * user_data;
|
| 351 |
};
|
| 352 |
|
| 353 |
-
static struct ggml_backend_reg ggml_backend_registry[
|
| 354 |
static size_t ggml_backend_registry_count = 0;
|
| 355 |
|
| 356 |
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
|
|
@@ -395,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
|
|
| 395 |
}
|
| 396 |
|
| 397 |
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
| 398 |
-
GGML_ASSERT(ggml_backend_registry_count <
|
| 399 |
|
| 400 |
size_t id = ggml_backend_registry_count;
|
| 401 |
|
|
@@ -746,8 +792,12 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
|
|
| 746 |
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
| 747 |
|
| 748 |
if (cpu_ctx->work_size < cplan.work_size) {
|
| 749 |
-
|
| 750 |
-
cpu_ctx->work_data =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 751 |
cpu_ctx->work_size = cplan.work_size;
|
| 752 |
}
|
| 753 |
cplan.work_data = cpu_ctx->work_data;
|
|
@@ -784,6 +834,11 @@ static struct ggml_backend_i cpu_backend_i = {
|
|
| 784 |
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
| 785 |
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
| 786 |
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 787 |
};
|
| 788 |
|
| 789 |
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
|
@@ -939,15 +994,27 @@ static bool ggml_is_view_op(enum ggml_op op) {
|
|
| 939 |
|
| 940 |
// scheduler
|
| 941 |
|
| 942 |
-
#
|
| 943 |
-
#define
|
| 944 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 945 |
|
| 946 |
struct ggml_backend_sched_split {
|
| 947 |
int backend_id;
|
| 948 |
int i_start;
|
| 949 |
int i_end;
|
| 950 |
-
struct ggml_tensor * inputs[
|
| 951 |
int n_inputs;
|
| 952 |
// graph view of this split
|
| 953 |
struct ggml_cgraph graph;
|
|
@@ -955,45 +1022,53 @@ struct ggml_backend_sched_split {
|
|
| 955 |
|
| 956 |
struct ggml_backend_sched {
|
| 957 |
bool is_reset; // true if the scheduler has been reset since the last graph split
|
|
|
|
| 958 |
|
| 959 |
int n_backends;
|
| 960 |
-
ggml_backend_t backends[GGML_MAX_BACKENDS];
|
| 961 |
-
ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
|
| 962 |
|
|
|
|
|
|
|
| 963 |
ggml_gallocr_t galloc;
|
| 964 |
|
| 965 |
// hash keys of the nodes in the graph
|
| 966 |
struct ggml_hash_set hash_set;
|
| 967 |
// hash values
|
| 968 |
int * tensor_backend_id;
|
| 969 |
-
struct ggml_tensor * (* tensor_copies)[
|
| 970 |
|
| 971 |
-
int * node_backend_ids; // [
|
| 972 |
-
int
|
| 973 |
|
| 974 |
// copy of the graph with modified inputs
|
| 975 |
struct ggml_cgraph * graph;
|
| 976 |
|
| 977 |
-
|
|
|
|
| 978 |
int n_splits;
|
| 979 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 980 |
struct ggml_context * ctx;
|
| 981 |
|
| 982 |
ggml_backend_sched_eval_callback callback_eval;
|
| 983 |
void * callback_eval_user_data;
|
| 984 |
|
| 985 |
// align context_buffer to GGML_MEM_ALIGN
|
| 986 |
-
|
| 987 |
__declspec(align(GGML_MEM_ALIGN))
|
| 988 |
-
|
| 989 |
__attribute__((aligned(GGML_MEM_ALIGN)))
|
| 990 |
-
|
| 991 |
-
char context_buffer[
|
| 992 |
};
|
| 993 |
|
| 994 |
-
#define hash_id(
|
| 995 |
-
#define tensor_backend_id(
|
| 996 |
-
#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
|
| 997 |
|
| 998 |
// returns the priority of the backend, lower id is higher priority
|
| 999 |
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
|
@@ -1005,7 +1080,8 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
|
|
| 1005 |
return -1;
|
| 1006 |
}
|
| 1007 |
|
| 1008 |
-
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched,
|
|
|
|
| 1009 |
if (buffer == NULL) {
|
| 1010 |
return -1;
|
| 1011 |
}
|
|
@@ -1016,12 +1092,16 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
|
|
| 1016 |
return i;
|
| 1017 |
}
|
| 1018 |
}
|
| 1019 |
-
|
| 1020 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1021 |
}
|
| 1022 |
|
| 1023 |
#if 0
|
| 1024 |
-
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 +
|
| 1025 |
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
| 1026 |
#define GET_CAUSE(node) causes[hash_id(node)]
|
| 1027 |
#else
|
|
@@ -1035,19 +1115,28 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
| 1035 |
|
| 1036 |
// assign pre-allocated nodes to their backend
|
| 1037 |
// dst
|
| 1038 |
-
int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor
|
| 1039 |
if (cur_backend != -1) {
|
| 1040 |
-
SET_CAUSE(
|
| 1041 |
return cur_backend;
|
| 1042 |
}
|
|
|
|
| 1043 |
// view_src
|
| 1044 |
if (tensor->view_src != NULL) {
|
| 1045 |
-
cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src
|
| 1046 |
if (cur_backend != -1) {
|
| 1047 |
-
SET_CAUSE(
|
| 1048 |
return cur_backend;
|
| 1049 |
}
|
| 1050 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1051 |
// assign nodes that use weights to the backend of the weights
|
| 1052 |
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
| 1053 |
const struct ggml_tensor * src = tensor->src[i];
|
|
@@ -1055,9 +1144,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
| 1055 |
continue;
|
| 1056 |
}
|
| 1057 |
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
| 1058 |
-
int src_backend = ggml_backend_sched_backend_from_buffer(sched, src
|
| 1059 |
// operations with weights are always run on the same backend as the weights
|
| 1060 |
-
SET_CAUSE(
|
| 1061 |
return src_backend;
|
| 1062 |
}
|
| 1063 |
}
|
|
@@ -1093,7 +1182,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
|
| 1093 |
if (ggml_is_view_op(node->op)) {
|
| 1094 |
continue;
|
| 1095 |
}
|
| 1096 |
-
ggml_backend_t tensor_backend =
|
| 1097 |
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
| 1098 |
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
| 1099 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
@@ -1101,7 +1190,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
|
| 1101 |
if (src == NULL) {
|
| 1102 |
continue;
|
| 1103 |
}
|
| 1104 |
-
ggml_backend_t src_backend =
|
| 1105 |
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
| 1106 |
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
| 1107 |
}
|
|
@@ -1118,6 +1207,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
|
| 1118 |
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1119 |
// reset splits
|
| 1120 |
sched->n_splits = 0;
|
|
|
|
| 1121 |
sched->is_reset = false;
|
| 1122 |
|
| 1123 |
struct ggml_init_params params = {
|
|
@@ -1163,7 +1253,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1163 |
}
|
| 1164 |
}
|
| 1165 |
#ifdef DEBUG_PASS1
|
| 1166 |
-
fprintf(stderr, "PASS 1 ASSIGNMENTS\n");
|
| 1167 |
#endif
|
| 1168 |
|
| 1169 |
// pass 2: expand current backend assignments
|
|
@@ -1171,10 +1261,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1171 |
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
|
| 1172 |
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
| 1173 |
|
| 1174 |
-
|
|
|
|
| 1175 |
{
|
| 1176 |
int cur_backend_id = -1;
|
| 1177 |
-
for (int i =
|
| 1178 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1179 |
if (ggml_is_view_op(node->op)) {
|
| 1180 |
continue;
|
|
@@ -1189,15 +1280,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1189 |
}
|
| 1190 |
} else {
|
| 1191 |
tensor_backend_id(node) = cur_backend_id;
|
| 1192 |
-
SET_CAUSE(node, "2.
|
| 1193 |
}
|
| 1194 |
}
|
| 1195 |
}
|
| 1196 |
|
| 1197 |
-
// pass 2.
|
| 1198 |
{
|
| 1199 |
int cur_backend_id = -1;
|
| 1200 |
-
for (int i =
|
| 1201 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1202 |
if (ggml_is_view_op(node->op)) {
|
| 1203 |
continue;
|
|
@@ -1212,15 +1303,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1212 |
}
|
| 1213 |
} else {
|
| 1214 |
tensor_backend_id(node) = cur_backend_id;
|
| 1215 |
-
SET_CAUSE(node, "2.
|
| 1216 |
}
|
| 1217 |
}
|
| 1218 |
}
|
| 1219 |
|
| 1220 |
-
|
|
|
|
| 1221 |
{
|
| 1222 |
int cur_backend_id = -1;
|
| 1223 |
-
for (int i =
|
| 1224 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1225 |
if (ggml_is_view_op(node->op)) {
|
| 1226 |
continue;
|
|
@@ -1230,15 +1322,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1230 |
cur_backend_id = tensor_backend_id;
|
| 1231 |
} else {
|
| 1232 |
tensor_backend_id(node) = cur_backend_id;
|
| 1233 |
-
SET_CAUSE(node, "2.
|
| 1234 |
}
|
| 1235 |
}
|
| 1236 |
}
|
| 1237 |
-
|
| 1238 |
-
// pass 2.4 expand rest down
|
| 1239 |
{
|
| 1240 |
int cur_backend_id = -1;
|
| 1241 |
-
for (int i =
|
| 1242 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1243 |
if (ggml_is_view_op(node->op)) {
|
| 1244 |
continue;
|
|
@@ -1248,12 +1339,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1248 |
cur_backend_id = tensor_backend_id;
|
| 1249 |
} else {
|
| 1250 |
tensor_backend_id(node) = cur_backend_id;
|
| 1251 |
-
SET_CAUSE(node, "2.
|
| 1252 |
}
|
| 1253 |
}
|
| 1254 |
}
|
|
|
|
| 1255 |
#ifdef DEBUG_PASS2
|
| 1256 |
-
fprintf(stderr, "PASS 2 ASSIGNMENTS\n");
|
| 1257 |
#endif
|
| 1258 |
|
| 1259 |
// pass 3: assign backends to remaining src from dst and view_src
|
|
@@ -1283,7 +1375,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1283 |
}
|
| 1284 |
}
|
| 1285 |
#ifdef DEBUG_PASS3
|
| 1286 |
-
fprintf(stderr, "PASS 3 ASSIGNMENTS\n");
|
| 1287 |
#endif
|
| 1288 |
|
| 1289 |
// pass 4: split graph, find tensors that need to be copied
|
|
@@ -1315,7 +1407,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1315 |
if (tensor_backend_id != cur_backend_id) {
|
| 1316 |
sched->splits[cur_split].i_end = i;
|
| 1317 |
cur_split++;
|
| 1318 |
-
GGML_ASSERT(cur_split <
|
| 1319 |
sched->splits[cur_split].backend_id = tensor_backend_id;
|
| 1320 |
sched->splits[cur_split].i_start = i;
|
| 1321 |
sched->splits[cur_split].n_inputs = 0;
|
|
@@ -1328,25 +1420,57 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1328 |
if (src == NULL) {
|
| 1329 |
continue;
|
| 1330 |
}
|
|
|
|
| 1331 |
int src_backend_id = tensor_backend_id(src);
|
| 1332 |
assert(src_backend_id != -1); // all inputs should be assigned by now
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1333 |
if (src_backend_id != tensor_backend_id) {
|
| 1334 |
// create a copy of the input in the split's backend
|
| 1335 |
size_t id = hash_id(src);
|
| 1336 |
-
if (sched->tensor_copies[id][cur_backend_id] == NULL) {
|
| 1337 |
ggml_backend_t backend = sched->backends[cur_backend_id];
|
| 1338 |
-
|
| 1339 |
-
|
| 1340 |
-
|
| 1341 |
-
|
| 1342 |
-
|
| 1343 |
-
|
| 1344 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1345 |
int n_inputs = sched->splits[cur_split].n_inputs++;
|
| 1346 |
-
GGML_ASSERT(n_inputs <
|
| 1347 |
sched->splits[cur_split].inputs[n_inputs] = src;
|
| 1348 |
}
|
| 1349 |
-
node->src[j] = sched->tensor_copies[id][cur_backend_id];
|
| 1350 |
}
|
| 1351 |
}
|
| 1352 |
}
|
|
@@ -1354,37 +1478,39 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1354 |
sched->n_splits = cur_split + 1;
|
| 1355 |
}
|
| 1356 |
#ifdef DEBUG_PASS4
|
| 1357 |
-
fprintf(stderr, "PASS 4 ASSIGNMENTS\n");
|
| 1358 |
#endif
|
| 1359 |
|
| 1360 |
#ifndef NDEBUG
|
| 1361 |
// sanity check: all sources should have the same backend as the node
|
| 1362 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1363 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1364 |
-
ggml_backend_t tensor_backend =
|
| 1365 |
if (tensor_backend == NULL) {
|
| 1366 |
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
|
| 1367 |
}
|
| 1368 |
-
if (node->view_src != NULL && tensor_backend !=
|
| 1369 |
fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
|
| 1370 |
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
|
| 1371 |
-
node->view_src->name,
|
|
|
|
| 1372 |
}
|
| 1373 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1374 |
struct ggml_tensor * src = node->src[j];
|
| 1375 |
if (src == NULL) {
|
| 1376 |
continue;
|
| 1377 |
}
|
| 1378 |
-
ggml_backend_t src_backend =
|
| 1379 |
if (src_backend != tensor_backend /* && src_backend != NULL */) {
|
| 1380 |
fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
|
| 1381 |
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
|
| 1382 |
j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
|
| 1383 |
}
|
| 1384 |
-
if (src->view_src != NULL && src_backend !=
|
| 1385 |
fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
|
| 1386 |
src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
|
| 1387 |
-
src->view_src->name,
|
|
|
|
| 1388 |
}
|
| 1389 |
}
|
| 1390 |
}
|
|
@@ -1392,18 +1518,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1392 |
#endif
|
| 1393 |
|
| 1394 |
// create copies of the graph for each split
|
| 1395 |
-
//
|
| 1396 |
-
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*
|
| 1397 |
for (int i = 0; i < sched->n_splits; i++) {
|
| 1398 |
struct ggml_backend_sched_split * split = &sched->splits[i];
|
| 1399 |
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
| 1400 |
|
|
|
|
| 1401 |
for (int j = 0; j < split->n_inputs; j++) {
|
| 1402 |
struct ggml_tensor * input = split->inputs[j];
|
| 1403 |
-
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
|
| 1404 |
|
| 1405 |
// add a dependency to the input source so that it is not freed before the copy is done
|
| 1406 |
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
|
|
|
|
| 1407 |
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
|
| 1408 |
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
|
| 1409 |
|
|
@@ -1417,18 +1545,56 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1417 |
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
|
| 1418 |
}
|
| 1419 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1420 |
sched->graph = graph_copy;
|
| 1421 |
}
|
| 1422 |
|
| 1423 |
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
| 1424 |
-
//
|
| 1425 |
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
|
|
|
|
|
|
| 1426 |
#ifndef NDEBUG
|
| 1427 |
-
fprintf(stderr, "
|
| 1428 |
#endif
|
| 1429 |
-
ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
|
| 1430 |
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
| 1431 |
-
fprintf(stderr, "
|
| 1432 |
return false;
|
| 1433 |
}
|
| 1434 |
}
|
|
@@ -1437,9 +1603,6 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
|
| 1437 |
}
|
| 1438 |
|
| 1439 |
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
| 1440 |
-
uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
|
| 1441 |
-
uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
|
| 1442 |
-
|
| 1443 |
struct ggml_backend_sched_split * splits = sched->splits;
|
| 1444 |
|
| 1445 |
for (int i = 0; i < sched->n_splits; i++) {
|
|
@@ -1448,34 +1611,36 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|
| 1448 |
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
| 1449 |
|
| 1450 |
// copy the input tensors to the split backend
|
| 1451 |
-
uint64_t copy_start_us = ggml_time_us();
|
| 1452 |
for (int j = 0; j < split->n_inputs; j++) {
|
|
|
|
| 1453 |
struct ggml_tensor * input = split->inputs[j];
|
| 1454 |
-
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
|
| 1455 |
|
| 1456 |
-
|
| 1457 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1458 |
|
| 1459 |
-
|
|
|
|
| 1460 |
}
|
| 1461 |
-
//ggml_backend_synchronize(split_backend); // necessary to measure copy time
|
| 1462 |
-
int64_t copy_end_us = ggml_time_us();
|
| 1463 |
-
copy_us[split_backend_id] += copy_end_us - copy_start_us;
|
| 1464 |
|
| 1465 |
-
#if 0
|
| 1466 |
-
char split_filename[GGML_MAX_NAME];
|
| 1467 |
-
snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
|
| 1468 |
-
ggml_graph_dump_dot(split->graph, NULL, split_filename);
|
| 1469 |
-
#endif
|
| 1470 |
-
|
| 1471 |
-
|
| 1472 |
-
uint64_t compute_start_us = ggml_time_us();
|
| 1473 |
if (!sched->callback_eval) {
|
| 1474 |
-
enum ggml_status ec =
|
| 1475 |
if (ec != GGML_STATUS_SUCCESS) {
|
| 1476 |
return ec;
|
| 1477 |
}
|
| 1478 |
-
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
|
| 1479 |
} else {
|
| 1480 |
// similar to ggml_backend_compare_graph_backend
|
| 1481 |
for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
|
|
@@ -1494,11 +1659,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|
| 1494 |
|
| 1495 |
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
| 1496 |
|
| 1497 |
-
enum ggml_status ec =
|
| 1498 |
if (ec != GGML_STATUS_SUCCESS) {
|
| 1499 |
return ec;
|
| 1500 |
}
|
| 1501 |
|
|
|
|
|
|
|
|
|
|
| 1502 |
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
| 1503 |
break;
|
| 1504 |
}
|
|
@@ -1506,39 +1674,54 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|
| 1506 |
j0 = j1;
|
| 1507 |
}
|
| 1508 |
}
|
| 1509 |
-
uint64_t compute_end_us = ggml_time_us();
|
| 1510 |
-
compute_us[split_backend_id] += compute_end_us - compute_start_us;
|
| 1511 |
-
}
|
| 1512 |
|
| 1513 |
-
|
| 1514 |
-
|
| 1515 |
-
|
| 1516 |
-
|
| 1517 |
-
|
| 1518 |
-
fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
|
| 1519 |
}
|
| 1520 |
}
|
| 1521 |
-
|
|
|
|
| 1522 |
|
| 1523 |
return GGML_STATUS_SUCCESS;
|
| 1524 |
}
|
| 1525 |
|
| 1526 |
-
ggml_backend_sched_t ggml_backend_sched_new(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1527 |
GGML_ASSERT(n_backends > 0);
|
| 1528 |
-
GGML_ASSERT(n_backends <=
|
|
|
|
| 1529 |
|
| 1530 |
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
|
| 1531 |
|
| 1532 |
// initialize hash table
|
| 1533 |
-
sched->hash_set = ggml_hash_set_new(graph_size +
|
| 1534 |
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
|
| 1535 |
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
|
| 1536 |
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
|
|
|
|
| 1537 |
|
| 1538 |
sched->n_backends = n_backends;
|
| 1539 |
-
|
| 1540 |
-
|
| 1541 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1542 |
}
|
| 1543 |
|
| 1544 |
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
|
@@ -1552,12 +1735,18 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|
| 1552 |
if (sched == NULL) {
|
| 1553 |
return;
|
| 1554 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1555 |
ggml_gallocr_free(sched->galloc);
|
| 1556 |
ggml_free(sched->ctx);
|
| 1557 |
free(sched->hash_set.keys);
|
| 1558 |
free(sched->tensor_backend_id);
|
| 1559 |
free(sched->tensor_copies);
|
| 1560 |
free(sched->node_backend_ids);
|
|
|
|
| 1561 |
free(sched);
|
| 1562 |
}
|
| 1563 |
|
|
@@ -1569,34 +1758,63 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
|
| 1569 |
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
|
| 1570 |
|
| 1571 |
sched->is_reset = true;
|
|
|
|
| 1572 |
}
|
| 1573 |
|
| 1574 |
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
| 1575 |
ggml_backend_sched_split_graph(sched, measure_graph);
|
| 1576 |
|
| 1577 |
-
|
|
|
|
| 1578 |
return false;
|
| 1579 |
}
|
| 1580 |
|
| 1581 |
ggml_backend_sched_reset(sched);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1582 |
return true;
|
| 1583 |
}
|
| 1584 |
|
| 1585 |
enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1586 |
-
|
|
|
|
|
|
|
|
|
|
| 1587 |
|
| 1588 |
-
|
|
|
|
| 1589 |
ggml_backend_sched_reset(sched);
|
| 1590 |
}
|
| 1591 |
|
| 1592 |
-
|
| 1593 |
-
|
| 1594 |
-
|
|
|
|
| 1595 |
}
|
| 1596 |
|
| 1597 |
return ggml_backend_sched_compute_splits(sched);
|
| 1598 |
}
|
| 1599 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1600 |
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
| 1601 |
sched->callback_eval = callback;
|
| 1602 |
sched->callback_eval_user_data = user_data;
|
|
@@ -1606,19 +1824,24 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
|
| 1606 |
return sched->n_splits;
|
| 1607 |
}
|
| 1608 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1609 |
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
| 1610 |
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
| 1611 |
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
|
|
| 1612 |
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
| 1613 |
}
|
| 1614 |
|
| 1615 |
-
void
|
| 1616 |
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
| 1617 |
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
| 1618 |
tensor_backend_id(node) = backend_index;
|
| 1619 |
}
|
| 1620 |
|
| 1621 |
-
ggml_backend_t
|
| 1622 |
int backend_index = tensor_backend_id(node);
|
| 1623 |
if (backend_index == -1) {
|
| 1624 |
return NULL;
|
|
|
|
| 221 |
GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 222 |
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 223 |
|
|
|
|
| 224 |
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
| 225 |
+
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 226 |
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
| 227 |
|
| 228 |
if (!size) {
|
| 229 |
return;
|
| 230 |
}
|
| 231 |
|
| 232 |
+
buf->iface.set_tensor(buf, tensor, data, offset, size);
|
| 233 |
}
|
| 234 |
|
| 235 |
GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 236 |
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 237 |
|
| 238 |
+
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
| 239 |
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
|
|
| 240 |
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
| 241 |
|
| 242 |
if (!size) {
|
| 243 |
return;
|
| 244 |
}
|
| 245 |
|
| 246 |
+
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
| 247 |
}
|
| 248 |
|
| 249 |
void ggml_backend_synchronize(ggml_backend_t backend) {
|
|
|
|
| 255 |
}
|
| 256 |
|
| 257 |
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 258 |
+
GGML_ASSERT(backend->iface.graph_plan_create != NULL);
|
| 259 |
+
|
| 260 |
return backend->iface.graph_plan_create(backend, cgraph);
|
| 261 |
}
|
| 262 |
|
| 263 |
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
| 264 |
+
GGML_ASSERT(backend->iface.graph_plan_free != NULL);
|
| 265 |
+
|
| 266 |
backend->iface.graph_plan_free(backend, plan);
|
| 267 |
}
|
| 268 |
|
| 269 |
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
| 270 |
+
GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
|
| 271 |
+
|
| 272 |
return backend->iface.graph_plan_compute(backend, plan);
|
| 273 |
}
|
| 274 |
|
| 275 |
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 276 |
+
enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
|
| 277 |
+
ggml_backend_synchronize(backend);
|
| 278 |
+
return err;
|
| 279 |
+
}
|
| 280 |
+
|
| 281 |
+
bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 282 |
return backend->iface.graph_compute(backend, cgraph);
|
| 283 |
}
|
| 284 |
|
|
|
|
| 326 |
}
|
| 327 |
}
|
| 328 |
|
| 329 |
+
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 330 |
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
| 331 |
|
| 332 |
if (src == dst) {
|
| 333 |
return;
|
| 334 |
}
|
| 335 |
|
| 336 |
+
if (backend_dst->iface.cpy_tensor_async != NULL) {
|
| 337 |
+
if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
|
| 338 |
+
return;
|
|
|
|
|
|
|
| 339 |
}
|
| 340 |
}
|
| 341 |
|
| 342 |
+
// an async copy would normally happen after all the queued operations on both backends are completed
|
| 343 |
+
// sync src, set_async dst
|
| 344 |
if (ggml_backend_buffer_is_host(src->buffer)) {
|
| 345 |
+
ggml_backend_synchronize(backend_src);
|
| 346 |
+
ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
|
| 347 |
+
} else {
|
| 348 |
+
ggml_backend_synchronize(backend_src);
|
| 349 |
ggml_backend_tensor_copy(src, dst);
|
| 350 |
+
ggml_backend_synchronize(backend_dst);
|
| 351 |
+
}
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
// events
|
| 355 |
+
|
| 356 |
+
ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
|
| 357 |
+
if (backend->iface.event_new == NULL) {
|
| 358 |
+
return NULL;
|
| 359 |
+
}
|
| 360 |
+
return backend->iface.event_new(backend);
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
+
void ggml_backend_event_free(ggml_backend_event_t event) {
|
| 364 |
+
if (event == NULL) {
|
| 365 |
+
return;
|
| 366 |
}
|
| 367 |
+
event->backend->iface.event_free(event);
|
| 368 |
+
}
|
| 369 |
+
|
| 370 |
+
void ggml_backend_event_record(ggml_backend_event_t event) {
|
| 371 |
+
GGML_ASSERT(event->backend->iface.event_record != NULL);
|
| 372 |
+
|
| 373 |
+
event->backend->iface.event_record(event);
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
|
| 377 |
+
GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
|
| 378 |
+
|
| 379 |
+
event->backend->iface.event_synchronize(event);
|
| 380 |
}
|
| 381 |
|
| 382 |
+
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
| 383 |
+
GGML_ASSERT(backend->iface.event_wait != NULL);
|
| 384 |
+
|
| 385 |
+
backend->iface.event_wait(backend, event);
|
| 386 |
+
}
|
| 387 |
|
| 388 |
// backend registry
|
| 389 |
|
| 390 |
+
#define GGML_REG_MAX_BACKENDS 16
|
| 391 |
|
| 392 |
struct ggml_backend_reg {
|
| 393 |
char name[128];
|
|
|
|
| 396 |
void * user_data;
|
| 397 |
};
|
| 398 |
|
| 399 |
+
static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
|
| 400 |
static size_t ggml_backend_registry_count = 0;
|
| 401 |
|
| 402 |
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
|
|
|
|
| 441 |
}
|
| 442 |
|
| 443 |
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
| 444 |
+
GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
|
| 445 |
|
| 446 |
size_t id = ggml_backend_registry_count;
|
| 447 |
|
|
|
|
| 792 |
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
| 793 |
|
| 794 |
if (cpu_ctx->work_size < cplan.work_size) {
|
| 795 |
+
free(cpu_ctx->work_data);
|
| 796 |
+
cpu_ctx->work_data = malloc(cplan.work_size);
|
| 797 |
+
if (cpu_ctx->work_data == NULL) {
|
| 798 |
+
cpu_ctx->work_size = 0;
|
| 799 |
+
return GGML_STATUS_ALLOC_FAILED;
|
| 800 |
+
}
|
| 801 |
cpu_ctx->work_size = cplan.work_size;
|
| 802 |
}
|
| 803 |
cplan.work_data = cpu_ctx->work_data;
|
|
|
|
| 834 |
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
| 835 |
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
| 836 |
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
| 837 |
+
/* .event_new = */ NULL,
|
| 838 |
+
/* .event_free = */ NULL,
|
| 839 |
+
/* .event_record = */ NULL,
|
| 840 |
+
/* .event_wait = */ NULL,
|
| 841 |
+
/* .event_synchronize = */ NULL,
|
| 842 |
};
|
| 843 |
|
| 844 |
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
|
|
|
| 994 |
|
| 995 |
// scheduler
|
| 996 |
|
| 997 |
+
#ifndef GGML_SCHED_MAX_BACKENDS
|
| 998 |
+
#define GGML_SCHED_MAX_BACKENDS 16
|
| 999 |
+
#endif
|
| 1000 |
+
|
| 1001 |
+
#ifndef GGML_SCHED_MAX_SPLITS
|
| 1002 |
+
#define GGML_SCHED_MAX_SPLITS 256
|
| 1003 |
+
#endif
|
| 1004 |
+
|
| 1005 |
+
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
|
| 1006 |
+
#define GGML_SCHED_MAX_SPLIT_INPUTS 16
|
| 1007 |
+
#endif
|
| 1008 |
+
|
| 1009 |
+
#ifndef GGML_SCHED_MAX_COPIES
|
| 1010 |
+
#define GGML_SCHED_MAX_COPIES 4
|
| 1011 |
+
#endif
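All four limits are wrapped in #ifndef guards, so they can be overridden at build time without editing the file; for example, the number of pipeline input copies can be lowered to trade overlap for memory (illustrative override, not part of the diff):

// Hypothetical override, defined before this file is compiled
// (or passed as a compiler flag such as -DGGML_SCHED_MAX_COPIES=2):
#define GGML_SCHED_MAX_COPIES 2 // fewer duplicated split inputs: less memory, less overlap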
|
| 1012 |
|
| 1013 |
struct ggml_backend_sched_split {
|
| 1014 |
int backend_id;
|
| 1015 |
int i_start;
|
| 1016 |
int i_end;
|
| 1017 |
+
struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
|
| 1018 |
int n_inputs;
|
| 1019 |
// graph view of this split
|
| 1020 |
struct ggml_cgraph graph;
|
|
|
|
| 1022 |
|
| 1023 |
struct ggml_backend_sched {
|
| 1024 |
bool is_reset; // true if the scheduler has been reset since the last graph split
|
| 1025 |
+
bool is_alloc;
|
| 1026 |
|
| 1027 |
int n_backends;
|
| 1028 |
|
| 1029 |
+
ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
|
| 1030 |
+
ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
|
| 1031 |
ggml_gallocr_t galloc;
|
| 1032 |
|
| 1033 |
// hash keys of the nodes in the graph
|
| 1034 |
struct ggml_hash_set hash_set;
|
| 1035 |
// hash values
|
| 1036 |
int * tensor_backend_id;
|
| 1037 |
+
struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
|
| 1038 |
|
| 1039 |
+
int * node_backend_ids; // [graph_size]
|
| 1040 |
+
int * leaf_backend_ids; // [graph_size]
|
| 1041 |
|
| 1042 |
// copy of the graph with modified inputs
|
| 1043 |
struct ggml_cgraph * graph;
|
| 1044 |
|
| 1045 |
+
// graph splits
|
| 1046 |
+
struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
|
| 1047 |
int n_splits;
|
| 1048 |
|
| 1049 |
+
// pipeline parallelism support
|
| 1050 |
+
int n_copies;
|
| 1051 |
+
int cur_copy;
|
| 1052 |
+
ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
|
| 1053 |
+
struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
|
| 1054 |
+
int n_graph_inputs;
|
| 1055 |
+
|
| 1056 |
struct ggml_context * ctx;
|
| 1057 |
|
| 1058 |
ggml_backend_sched_eval_callback callback_eval;
|
| 1059 |
void * callback_eval_user_data;
|
| 1060 |
|
| 1061 |
// align context_buffer to GGML_MEM_ALIGN
|
| 1062 |
+
#ifdef _MSC_VER
|
| 1063 |
__declspec(align(GGML_MEM_ALIGN))
|
| 1064 |
+
#else
|
| 1065 |
__attribute__((aligned(GGML_MEM_ALIGN)))
|
| 1066 |
+
#endif
|
| 1067 |
+
char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
|
| 1068 |
};
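The pipeline-parallelism fields form a small ring: tensor_copies holds up to GGML_SCHED_MAX_COPIES versions of each split input per backend, events[b][c] guards copy slot c on backend b, and cur_copy selects the slot used by the graph currently being submitted. A one-line illustration of how the slot index advances (it mirrors the update at the end of compute_splits below):

// Illustration only: the copy slot cycles 0, 1, ..., n_copies-1, 0, ...
static int sched_next_copy(int cur_copy, int n_copies) {
    return (cur_copy + 1) % n_copies;
}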
|
| 1069 |
|
| 1070 |
+
#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
|
| 1071 |
+
#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
|
|
|
|
| 1072 |
|
| 1073 |
// returns the priority of the backend, lower id is higher priority
|
| 1074 |
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
|
|
|
| 1080 |
return -1;
|
| 1081 |
}
|
| 1082 |
|
| 1083 |
+
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
|
| 1084 |
+
ggml_backend_buffer_t buffer = tensor->buffer;
|
| 1085 |
if (buffer == NULL) {
|
| 1086 |
return -1;
|
| 1087 |
}
|
|
|
|
| 1092 |
return i;
|
| 1093 |
}
|
| 1094 |
}
|
| 1095 |
+
|
| 1096 |
+
fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
|
| 1097 |
+
__func__, ggml_backend_buffer_name(buffer), tensor->name);
|
| 1098 |
+
GGML_ASSERT(false);
|
| 1099 |
+
|
| 1100 |
+
return -1;
|
| 1101 |
}
|
| 1102 |
|
| 1103 |
#if 0
|
| 1104 |
+
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
|
| 1105 |
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
| 1106 |
#define GET_CAUSE(node) causes[hash_id(node)]
|
| 1107 |
#else
|
|
|
|
| 1115 |
|
| 1116 |
// assign pre-allocated nodes to their backend
|
| 1117 |
// dst
|
| 1118 |
+
int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
|
| 1119 |
if (cur_backend != -1) {
|
| 1120 |
+
SET_CAUSE(tensor, "1.dst");
|
| 1121 |
return cur_backend;
|
| 1122 |
}
|
| 1123 |
+
|
| 1124 |
// view_src
|
| 1125 |
if (tensor->view_src != NULL) {
|
| 1126 |
+
cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
|
| 1127 |
if (cur_backend != -1) {
|
| 1128 |
+
SET_CAUSE(tensor, "1.vsrc");
|
| 1129 |
return cur_backend;
|
| 1130 |
}
|
| 1131 |
}
|
| 1132 |
+
|
| 1133 |
+
// input
|
| 1134 |
+
if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
|
| 1135 |
+
cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
|
| 1136 |
+
SET_CAUSE(tensor, "1.inp");
|
| 1137 |
+
return cur_backend;
|
| 1138 |
+
}
|
| 1139 |
+
|
| 1140 |
// assign nodes that use weights to the backend of the weights
|
| 1141 |
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
| 1142 |
const struct ggml_tensor * src = tensor->src[i];
|
|
|
|
| 1144 |
continue;
|
| 1145 |
}
|
| 1146 |
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
| 1147 |
+
int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
|
| 1148 |
// operations with weights are always run on the same backend as the weights
|
| 1149 |
+
SET_CAUSE(tensor, "1.wgt%d", i);
|
| 1150 |
return src_backend;
|
| 1151 |
}
|
| 1152 |
}
|
|
|
|
| 1182 |
if (ggml_is_view_op(node->op)) {
|
| 1183 |
continue;
|
| 1184 |
}
|
| 1185 |
+
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
| 1186 |
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
| 1187 |
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
| 1188 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
|
|
| 1190 |
if (src == NULL) {
|
| 1191 |
continue;
|
| 1192 |
}
|
| 1193 |
+
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
|
| 1194 |
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
| 1195 |
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
| 1196 |
}
|
|
|
|
| 1207 |
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1208 |
// reset splits
|
| 1209 |
sched->n_splits = 0;
|
| 1210 |
+
sched->n_graph_inputs = 0;
|
| 1211 |
sched->is_reset = false;
|
| 1212 |
|
| 1213 |
struct ggml_init_params params = {
|
|
|
|
| 1253 |
}
|
| 1254 |
}
|
| 1255 |
#ifdef DEBUG_PASS1
|
| 1256 |
+
fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
| 1257 |
#endif
|
| 1258 |
|
| 1259 |
// pass 2: expand current backend assignments
|
|
|
|
| 1261 |
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
|
| 1262 |
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
| 1263 |
|
| 1264 |
+
|
| 1265 |
+
// pass 2.2 expand gpu down
|
| 1266 |
{
|
| 1267 |
int cur_backend_id = -1;
|
| 1268 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1269 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1270 |
if (ggml_is_view_op(node->op)) {
|
| 1271 |
continue;
|
|
|
|
| 1280 |
}
|
| 1281 |
} else {
|
| 1282 |
tensor_backend_id(node) = cur_backend_id;
|
| 1283 |
+
SET_CAUSE(node, "2.2");
|
| 1284 |
}
|
| 1285 |
}
|
| 1286 |
}
|
| 1287 |
|
| 1288 |
+
// pass 2.1 expand gpu up
|
| 1289 |
{
|
| 1290 |
int cur_backend_id = -1;
|
| 1291 |
+
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
| 1292 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1293 |
if (ggml_is_view_op(node->op)) {
|
| 1294 |
continue;
|
|
|
|
| 1303 |
}
|
| 1304 |
} else {
|
| 1305 |
tensor_backend_id(node) = cur_backend_id;
|
| 1306 |
+
SET_CAUSE(node, "2.1");
|
| 1307 |
}
|
| 1308 |
}
|
| 1309 |
}
|
| 1310 |
|
| 1311 |
+
|
| 1312 |
+
// pass 2.4 expand rest down
|
| 1313 |
{
|
| 1314 |
int cur_backend_id = -1;
|
| 1315 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1316 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1317 |
if (ggml_is_view_op(node->op)) {
|
| 1318 |
continue;
|
|
|
|
| 1322 |
cur_backend_id = tensor_backend_id;
|
| 1323 |
} else {
|
| 1324 |
tensor_backend_id(node) = cur_backend_id;
|
| 1325 |
+
SET_CAUSE(node, "2.4");
|
| 1326 |
}
|
| 1327 |
}
|
| 1328 |
}
|
| 1329 |
+
// pass 2.3 expand rest up
|
|
|
|
| 1330 |
{
|
| 1331 |
int cur_backend_id = -1;
|
| 1332 |
+
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
| 1333 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1334 |
if (ggml_is_view_op(node->op)) {
|
| 1335 |
continue;
|
|
|
|
| 1339 |
cur_backend_id = tensor_backend_id;
|
| 1340 |
} else {
|
| 1341 |
tensor_backend_id(node) = cur_backend_id;
|
| 1342 |
+
SET_CAUSE(node, "2.3");
|
| 1343 |
}
|
| 1344 |
}
|
| 1345 |
}
|
| 1346 |
+
|
| 1347 |
#ifdef DEBUG_PASS2
|
| 1348 |
+
fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
| 1349 |
#endif
|
| 1350 |
|
| 1351 |
// pass 3: assign backends to remaining src from dst and view_src
|
|
|
|
| 1375 |
}
|
| 1376 |
}
|
| 1377 |
#ifdef DEBUG_PASS3
|
| 1378 |
+
fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
| 1379 |
#endif
|
| 1380 |
|
| 1381 |
// pass 4: split graph, find tensors that need to be copied
|
|
|
|
| 1407 |
if (tensor_backend_id != cur_backend_id) {
|
| 1408 |
sched->splits[cur_split].i_end = i;
|
| 1409 |
cur_split++;
|
| 1410 |
+
GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
|
| 1411 |
sched->splits[cur_split].backend_id = tensor_backend_id;
|
| 1412 |
sched->splits[cur_split].i_start = i;
|
| 1413 |
sched->splits[cur_split].n_inputs = 0;
|
|
|
|
| 1420 |
if (src == NULL) {
|
| 1421 |
continue;
|
| 1422 |
}
|
| 1423 |
+
|
| 1424 |
int src_backend_id = tensor_backend_id(src);
|
| 1425 |
assert(src_backend_id != -1); // all inputs should be assigned by now
|
| 1426 |
+
|
| 1427 |
+
if (src->flags & GGML_TENSOR_FLAG_INPUT) {
|
| 1428 |
+
size_t id = hash_id(src);
|
| 1429 |
+
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
|
| 1430 |
+
ggml_backend_t backend = sched->backends[src_backend_id];
|
| 1431 |
+
for (int c = 0; c < sched->n_copies; c++) {
|
| 1432 |
+
struct ggml_tensor * tensor_copy;
|
| 1433 |
+
if (c == sched->cur_copy) {
|
| 1434 |
+
tensor_copy = src; // use the original tensor as the current copy
|
| 1435 |
+
} else {
|
| 1436 |
+
tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
| 1437 |
+
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
|
| 1438 |
+
}
|
| 1439 |
+
if (sched->n_copies > 1) {
|
| 1440 |
+
ggml_set_input(tensor_copy);
|
| 1441 |
+
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
| 1442 |
+
}
|
| 1443 |
+
sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
|
| 1444 |
+
tensor_backend_id(tensor_copy) = src_backend_id;
|
| 1445 |
+
SET_CAUSE(tensor_copy, "4.cpy");
|
| 1446 |
+
}
|
| 1447 |
+
int n_graph_inputs = sched->n_graph_inputs++;
|
| 1448 |
+
GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
|
| 1449 |
+
sched->graph_inputs[n_graph_inputs] = src;
|
| 1450 |
+
}
|
| 1451 |
+
}
|
| 1452 |
+
|
| 1453 |
if (src_backend_id != tensor_backend_id) {
|
| 1454 |
// create a copy of the input in the split's backend
|
| 1455 |
size_t id = hash_id(src);
|
| 1456 |
+
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
|
| 1457 |
ggml_backend_t backend = sched->backends[cur_backend_id];
|
| 1458 |
+
for (int c = 0; c < sched->n_copies; c++) {
|
| 1459 |
+
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
| 1460 |
+
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
|
| 1461 |
+
if (sched->n_copies > 1) {
|
| 1462 |
+
ggml_set_input(tensor_copy);
|
| 1463 |
+
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
| 1464 |
+
}
|
| 1465 |
+
sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
|
| 1466 |
+
tensor_backend_id(tensor_copy) = cur_backend_id;
|
| 1467 |
+
SET_CAUSE(tensor_copy, "4.cpy");
|
| 1468 |
+
}
|
| 1469 |
int n_inputs = sched->splits[cur_split].n_inputs++;
|
| 1470 |
+
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
|
| 1471 |
sched->splits[cur_split].inputs[n_inputs] = src;
|
| 1472 |
}
|
| 1473 |
+
node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
|
| 1474 |
}
|
| 1475 |
}
|
| 1476 |
}
|
|
|
|
| 1478 |
sched->n_splits = cur_split + 1;
|
| 1479 |
}
|
| 1480 |
#ifdef DEBUG_PASS4
|
| 1481 |
+
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
| 1482 |
#endif
|
| 1483 |
|
| 1484 |
#ifndef NDEBUG
|
| 1485 |
// sanity check: all sources should have the same backend as the node
|
| 1486 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1487 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1488 |
+
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
| 1489 |
if (tensor_backend == NULL) {
|
| 1490 |
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
|
| 1491 |
}
|
| 1492 |
+
if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
|
| 1493 |
fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
|
| 1494 |
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
|
| 1495 |
+
node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
|
| 1496 |
+
ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
|
| 1497 |
}
|
| 1498 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1499 |
struct ggml_tensor * src = node->src[j];
|
| 1500 |
if (src == NULL) {
|
| 1501 |
continue;
|
| 1502 |
}
|
| 1503 |
+
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
|
| 1504 |
if (src_backend != tensor_backend /* && src_backend != NULL */) {
|
| 1505 |
fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
|
| 1506 |
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
|
| 1507 |
j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
|
| 1508 |
}
|
| 1509 |
+
if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
|
| 1510 |
fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
|
| 1511 |
src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
|
| 1512 |
+
src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
|
| 1513 |
+
ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
|
| 1514 |
}
|
| 1515 |
}
|
| 1516 |
}
|
|
|
|
| 1518 |
#endif
|
| 1519 |
|
| 1520 |
// create copies of the graph for each split
|
| 1521 |
+
// TODO: avoid this copy
|
| 1522 |
+
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
|
| 1523 |
for (int i = 0; i < sched->n_splits; i++) {
|
| 1524 |
struct ggml_backend_sched_split * split = &sched->splits[i];
|
| 1525 |
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
| 1526 |
|
| 1527 |
+
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
|
| 1528 |
for (int j = 0; j < split->n_inputs; j++) {
|
| 1529 |
struct ggml_tensor * input = split->inputs[j];
|
| 1530 |
+
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
|
| 1531 |
|
| 1532 |
// add a dependency to the input source so that it is not freed before the copy is done
|
| 1533 |
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
|
| 1534 |
+
input_dep->src[0] = input;
|
| 1535 |
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
|
| 1536 |
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
|
| 1537 |
|
|
|
|
| 1545 |
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
|
| 1546 |
}
|
| 1547 |
}
|
| 1548 |
+
|
| 1549 |
+
if (sched->n_copies > 1) {
|
| 1550 |
+
// add input copies as leafs so that they are allocated first
|
| 1551 |
+
for (int i = 0; i < sched->n_graph_inputs; i++) {
|
| 1552 |
+
struct ggml_tensor * input = sched->graph_inputs[i];
|
| 1553 |
+
size_t id = hash_id(input);
|
| 1554 |
+
int backend_id = tensor_backend_id(input);
|
| 1555 |
+
for (int c = 0; c < sched->n_copies; c++) {
|
| 1556 |
+
struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
|
| 1557 |
+
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
| 1558 |
+
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
| 1559 |
+
}
|
| 1560 |
+
}
|
| 1561 |
+
|
| 1562 |
+
for (int i = 0; i < sched->n_splits; i++) {
|
| 1563 |
+
struct ggml_backend_sched_split * split = &sched->splits[i];
|
| 1564 |
+
int backend_id = split->backend_id;
|
| 1565 |
+
for (int j = 0; j < split->n_inputs; j++) {
|
| 1566 |
+
struct ggml_tensor * input = split->inputs[j];
|
| 1567 |
+
size_t id = hash_id(input);
|
| 1568 |
+
for (int c = 0; c < sched->n_copies; c++) {
|
| 1569 |
+
struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
|
| 1570 |
+
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
| 1571 |
+
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
| 1572 |
+
}
|
| 1573 |
+
}
|
| 1574 |
+
}
|
| 1575 |
+
}
|
| 1576 |
+
|
| 1577 |
+
// add leafs from the original graph
|
| 1578 |
+
for (int i = 0; i < graph->n_leafs; i++) {
|
| 1579 |
+
struct ggml_tensor * leaf = graph->leafs[i];
|
| 1580 |
+
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
| 1581 |
+
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
| 1582 |
+
}
|
| 1583 |
+
|
| 1584 |
sched->graph = graph_copy;
|
| 1585 |
}
|
| 1586 |
|
| 1587 |
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
| 1588 |
+
// allocate graph
|
| 1589 |
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
| 1590 |
+
// the re-allocation may cause the split inputs to be moved to a different address
|
| 1591 |
+
ggml_backend_sched_synchronize(sched);
|
| 1592 |
#ifndef NDEBUG
|
| 1593 |
+
fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
|
| 1594 |
#endif
|
| 1595 |
+
ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
|
| 1596 |
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
| 1597 |
+
fprintf(stderr, "%s: failed to allocate graph\n", __func__);
|
| 1598 |
return false;
|
| 1599 |
}
|
| 1600 |
}
|
|
|
|
| 1603 |
}
|
| 1604 |
|
| 1605 |
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
| 1606 |
struct ggml_backend_sched_split * splits = sched->splits;
|
| 1607 |
|
| 1608 |
for (int i = 0; i < sched->n_splits; i++) {
|
|
|
|
| 1611 |
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
| 1612 |
|
| 1613 |
// copy the input tensors to the split backend
|
|
|
|
| 1614 |
for (int j = 0; j < split->n_inputs; j++) {
|
| 1615 |
+
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
|
| 1616 |
struct ggml_tensor * input = split->inputs[j];
|
| 1617 |
+
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
|
| 1618 |
|
| 1619 |
+
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
|
| 1620 |
+
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
| 1621 |
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
| 1622 |
+
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
| 1623 |
+
} else {
|
| 1624 |
+
ggml_backend_synchronize(split_backend);
|
| 1625 |
+
}
|
| 1626 |
+
ggml_backend_tensor_copy(input, input_cpy);
|
| 1627 |
+
} else {
|
| 1628 |
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
| 1629 |
+
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
| 1630 |
+
} else {
|
| 1631 |
+
ggml_backend_synchronize(split_backend);
|
| 1632 |
+
ggml_backend_synchronize(input_backend);
|
| 1633 |
+
}
|
| 1634 |
|
| 1635 |
+
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
|
| 1636 |
+
}
|
| 1637 |
}
|
| 1638 |
|
| 1639 |
if (!sched->callback_eval) {
|
| 1640 |
+
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
|
| 1641 |
if (ec != GGML_STATUS_SUCCESS) {
|
| 1642 |
return ec;
|
| 1643 |
}
|
|
|
|
| 1644 |
} else {
|
| 1645 |
// similar to ggml_backend_compare_graph_backend
|
| 1646 |
for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
|
|
|
|
| 1659 |
|
| 1660 |
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
| 1661 |
|
| 1662 |
+
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
|
| 1663 |
if (ec != GGML_STATUS_SUCCESS) {
|
| 1664 |
return ec;
|
| 1665 |
}
|
| 1666 |
|
| 1667 |
+
// TODO: pass backend to the callback, then the user can decide if they want to synchronize
|
| 1668 |
+
ggml_backend_synchronize(split_backend);
|
| 1669 |
+
|
| 1670 |
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
| 1671 |
break;
|
| 1672 |
}
|
|
|
|
| 1674 |
j0 = j1;
|
| 1675 |
}
|
| 1676 |
}
|
| 1677 |
|
| 1678 |
+
// record the event of this copy
|
| 1679 |
+
if (split->n_inputs > 0) {
|
| 1680 |
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
| 1681 |
+
ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
|
| 1682 |
+
}
|
|
|
|
| 1683 |
}
|
| 1684 |
}
|
| 1685 |
+
|
| 1686 |
+
sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
|
| 1687 |
|
| 1688 |
return GGML_STATUS_SUCCESS;
|
| 1689 |
}
|
| 1690 |
|
| 1691 |
+
ggml_backend_sched_t ggml_backend_sched_new(
|
| 1692 |
+
ggml_backend_t * backends,
|
| 1693 |
+
ggml_backend_buffer_type_t * bufts,
|
| 1694 |
+
int n_backends,
|
| 1695 |
+
size_t graph_size,
|
| 1696 |
+
bool parallel) {
|
| 1697 |
GGML_ASSERT(n_backends > 0);
|
| 1698 |
+
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
| 1699 |
+
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
| 1700 |
|
| 1701 |
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
|
| 1702 |
|
| 1703 |
// initialize hash table
|
| 1704 |
+
sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
|
| 1705 |
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
|
| 1706 |
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
|
| 1707 |
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
|
| 1708 |
+
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
|
| 1709 |
|
| 1710 |
sched->n_backends = n_backends;
|
| 1711 |
+
|
| 1712 |
+
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
| 1713 |
+
|
| 1714 |
+
GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
|
| 1715 |
+
|
| 1716 |
+
for (int b = 0; b < n_backends; b++) {
|
| 1717 |
+
sched->backends[b] = backends[b];
|
| 1718 |
+
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
|
| 1719 |
+
GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
|
| 1720 |
+
if (sched->n_copies > 1) {
|
| 1721 |
+
for (int c = 0; c < sched->n_copies; c++) {
|
| 1722 |
+
sched->events[b][c] = ggml_backend_event_new(backends[b]);
|
| 1723 |
+
}
|
| 1724 |
+
}
|
| 1725 |
}
|
| 1726 |
|
| 1727 |
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
|
|
|
| 1735 |
if (sched == NULL) {
|
| 1736 |
return;
|
| 1737 |
}
|
| 1738 |
+
for (int b = 0; b < sched->n_backends; b++) {
|
| 1739 |
+
for (int c = 0; c < sched->n_copies; c++) {
|
| 1740 |
+
ggml_backend_event_free(sched->events[b][c]);
|
| 1741 |
+
}
|
| 1742 |
+
}
|
| 1743 |
ggml_gallocr_free(sched->galloc);
|
| 1744 |
ggml_free(sched->ctx);
|
| 1745 |
free(sched->hash_set.keys);
|
| 1746 |
free(sched->tensor_backend_id);
|
| 1747 |
free(sched->tensor_copies);
|
| 1748 |
free(sched->node_backend_ids);
|
| 1749 |
+
free(sched->leaf_backend_ids);
|
| 1750 |
free(sched);
|
| 1751 |
}
|
| 1752 |
|
|
|
|
| 1758 |
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
|
| 1759 |
|
| 1760 |
sched->is_reset = true;
|
| 1761 |
+
sched->is_alloc = false;
|
| 1762 |
}
|
| 1763 |
|
| 1764 |
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
| 1765 |
ggml_backend_sched_split_graph(sched, measure_graph);
|
| 1766 |
|
| 1767 |
+
// TODO: extract this to a separate function
|
| 1768 |
+
if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
| 1769 |
return false;
|
| 1770 |
}
|
| 1771 |
|
| 1772 |
ggml_backend_sched_reset(sched);
|
| 1773 |
+
ggml_backend_sched_synchronize(sched);
|
| 1774 |
+
|
| 1775 |
+
return true;
|
| 1776 |
+
}
|
| 1777 |
+
|
| 1778 |
+
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1779 |
+
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
|
| 1780 |
+
|
| 1781 |
+
ggml_backend_sched_split_graph(sched, graph);
|
| 1782 |
+
|
| 1783 |
+
if (!ggml_backend_sched_alloc_splits(sched)) {
|
| 1784 |
+
return false;
|
| 1785 |
+
}
|
| 1786 |
+
|
| 1787 |
+
sched->is_alloc = true;
|
| 1788 |
+
|
| 1789 |
return true;
|
| 1790 |
}
|
| 1791 |
|
| 1792 |
enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1793 |
+
enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
|
| 1794 |
+
ggml_backend_sched_synchronize(sched);
|
| 1795 |
+
return err;
|
| 1796 |
+
}
|
| 1797 |
|
| 1798 |
+
enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1799 |
+
if (!sched->is_reset && !sched->is_alloc) {
|
| 1800 |
ggml_backend_sched_reset(sched);
|
| 1801 |
}
|
| 1802 |
|
| 1803 |
+
if (!sched->is_alloc) {
|
| 1804 |
+
if (!ggml_backend_sched_alloc_graph(sched, graph)) {
|
| 1805 |
+
return GGML_STATUS_ALLOC_FAILED;
|
| 1806 |
+
}
|
| 1807 |
}
|
| 1808 |
|
| 1809 |
return ggml_backend_sched_compute_splits(sched);
|
| 1810 |
}
|
| 1811 |
|
| 1812 |
+
void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
| 1813 |
+
for (int i = 0; i < sched->n_backends; i++) {
|
| 1814 |
+
ggml_backend_synchronize(sched->backends[i]);
|
| 1815 |
+
}
|
| 1816 |
+
}
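ggml_backend_sched_synchronize simply blocks on every registered backend, which is what makes the asynchronous compute path safe to drive from a single-threaded caller. A hedged sketch of the intended pattern (helper and output handling are illustrative):

// Sketch only: submit a graph asynchronously, then wait before reading results.
static void compute_and_read(ggml_backend_sched_t sched, struct ggml_cgraph * graph,
                             struct ggml_tensor * result, void * out) {
    ggml_backend_sched_graph_compute_async(sched, graph);
    ggml_backend_sched_synchronize(sched); // waits on all backends in the scheduler
    ggml_backend_tensor_get(result, out, 0, ggml_nbytes(result));
}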
|
| 1817 |
+
|
| 1818 |
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
| 1819 |
sched->callback_eval = callback;
|
| 1820 |
sched->callback_eval_user_data = user_data;
|
|
|
|
| 1824 |
return sched->n_splits;
|
| 1825 |
}
|
| 1826 |
|
| 1827 |
+
int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
|
| 1828 |
+
return sched->n_copies;
|
| 1829 |
+
}
|
| 1830 |
+
|
| 1831 |
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
| 1832 |
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
| 1833 |
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
| 1834 |
+
|
| 1835 |
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
| 1836 |
}
|
| 1837 |
|
| 1838 |
+
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
| 1839 |
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
| 1840 |
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
| 1841 |
tensor_backend_id(node) = backend_index;
|
| 1842 |
}
|
| 1843 |
|
| 1844 |
+
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
| 1845 |
int backend_index = tensor_backend_id(node);
|
| 1846 |
if (backend_index == -1) {
|
| 1847 |
return NULL;
|
|
@@ -9,6 +9,7 @@ extern "C" {
|
|
| 9 |
|
| 10 |
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
|
| 11 |
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
|
|
|
| 12 |
typedef struct ggml_backend * ggml_backend_t;
|
| 13 |
typedef void * ggml_backend_graph_plan_t;
|
| 14 |
|
|
@@ -72,11 +73,24 @@ extern "C" {
|
|
| 72 |
GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 73 |
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 74 |
|
|
|
|
| 75 |
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
| 76 |
|
| 77 |
// tensor copy between different backends
|
| 78 |
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
|
| 79 |
-
|
| 80 |
|
| 81 |
//
|
| 82 |
// CPU backend
|
|
@@ -123,27 +137,31 @@ extern "C" {
|
|
| 123 |
/*
|
| 124 |
Example usage:
|
| 125 |
|
| 126 |
-
|
| 127 |
-
//
|
|
|
|
| 128 |
|
| 129 |
-
|
| 130 |
-
measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
|
| 131 |
|
| 132 |
-
//
|
| 133 |
-
build_graph(
|
| 134 |
-
// manually assign nodes to a backend (optional, should not be needed in most cases)
|
| 135 |
-
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
|
| 136 |
-
ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
|
| 137 |
-
}
|
| 138 |
|
| 139 |
-
//
|
| 140 |
-
|
|
|
|
| 141 |
|
| 142 |
-
|
| 143 |
|
| 144 |
// compute
|
| 145 |
graph = build_graph(sched);
|
| 146 |
ggml_backend_sched_graph_compute(sched, graph);
|
| 147 |
*/
|
| 148 |
|
| 149 |
struct ggml_backend_sched;
|
|
@@ -158,20 +176,26 @@ extern "C" {
|
|
| 158 |
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
| 159 |
|
| 160 |
// Initialize a backend scheduler
|
| 161 |
-
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
|
| 162 |
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
|
|
|
| 163 |
// Initialize backend buffers from a measure graph
|
| 164 |
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
|
|
|
| 165 |
// Get the number of splits of the last graph
|
| 166 |
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
|
|
|
| 167 |
|
| 168 |
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
| 169 |
|
| 170 |
-
GGML_API void
|
| 171 |
-
GGML_API ggml_backend_t
|
| 172 |
|
| 173 |
// Allocate and compute graph on the backend scheduler
|
|
|
|
| 174 |
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
|
|
|
|
|
|
| 175 |
|
| 176 |
// Reset all assignments and allocators - must be called before changing the node backends
|
| 177 |
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
|
|
|
| 9 |
|
| 10 |
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
|
| 11 |
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
| 12 |
+
typedef struct ggml_backend_event * ggml_backend_event_t;
|
| 13 |
typedef struct ggml_backend * ggml_backend_t;
|
| 14 |
typedef void * ggml_backend_graph_plan_t;
|
| 15 |
|
|
|
|
| 73 |
GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
| 74 |
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 75 |
|
| 76 |
+
GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
| 77 |
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
| 78 |
|
| 79 |
// tensor copy between different backends
|
| 80 |
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
|
| 81 |
+
|
| 82 |
+
// asynchronous copy
|
| 83 |
+
// the copy is performed after all the currently queued operations in backend_src
|
| 84 |
+
// backend_dst will wait for the copy to complete before performing other operations
|
| 85 |
+
// automatic fallback to sync copy if async is not supported
|
| 86 |
+
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
|
| 87 |
+
|
| 88 |
+
// events
|
| 89 |
+
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
|
| 90 |
+
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
|
| 91 |
+
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
|
| 92 |
+
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
|
| 93 |
+
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
|
| 94 |
|
| 95 |
//
|
| 96 |
// CPU backend
|
|
|
|
| 137 |
/*
|
| 138 |
Example usage:
|
| 139 |
|
| 140 |
+
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
|
| 141 |
+
// preferably to run on the same backend as the buffer
|
| 142 |
+
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
| 143 |
|
| 144 |
+
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
|
|
|
|
| 145 |
|
| 146 |
+
// initialize buffers from a max size graph (optional)
|
| 147 |
+
reserve_graph = build_graph(sched, max_batch_size);
|
| 148 |
|
| 149 |
+
// manually assign nodes to a backend (optional, should not be needed in most cases)
|
| 150 |
+
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
|
| 151 |
+
ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
|
| 152 |
|
| 153 |
+
ggml_backend_sched_reserve(sched, reserve_graph);
|
| 154 |
|
| 155 |
// compute
|
| 156 |
graph = build_graph(sched);
|
| 157 |
ggml_backend_sched_graph_compute(sched, graph);
|
| 158 |
+
|
| 159 |
+
// if there are graph inputs:
|
| 160 |
+
ggml_backend_sched_reset(sched);
|
| 161 |
+
ggml_backend_sched_alloc_graph(sched, graph);
|
| 162 |
+
ggml_backend_tensor_set(input_tensor, ...);
|
| 163 |
+
ggml_backend_sched_graph_compute(sched, graph);
|
| 164 |
+
}
|
| 165 |
*/
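The example above covers the synchronous path; with the new parallel flag and the asynchronous entry points added in this change, the same setup could be driven roughly as follows (a sketch in the same pseudo-style as the comment block, not an additional documented example):

// Sketch only: enable pipeline parallelism and overlap compute with host work.
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, true);

graph = build_graph(sched);
ggml_backend_sched_graph_compute_async(sched, graph); // returns without blocking
// ... prepare the next batch on the host ...
ggml_backend_sched_synchronize(sched);                // wait before reading outputs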
|
| 166 |
|
| 167 |
struct ggml_backend_sched;
|
|
|
|
| 176 |
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
| 177 |
|
| 178 |
// Initialize a backend scheduler
|
| 179 |
+
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
|
| 180 |
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
| 181 |
+
|
| 182 |
// Initialize backend buffers from a measure graph
|
| 183 |
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
| 184 |
+
|
| 185 |
// Get the number of splits of the last graph
|
| 186 |
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
| 187 |
+
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
|
| 188 |
|
| 189 |
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
| 190 |
|
| 191 |
+
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
| 192 |
+
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
| 193 |
|
| 194 |
// Allocate and compute graph on the backend scheduler
|
| 195 |
+
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
| 196 |
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
| 197 |
+
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
| 198 |
+
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
|
| 199 |
|
| 200 |
// Reset all assignments and allocators - must be called before changing the node backends
|
| 201 |
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
|
@@ -72,6 +72,7 @@
|
|
| 72 |
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
| 73 |
#define cudaEventDisableTiming hipEventDisableTiming
|
| 74 |
#define cudaEventRecord hipEventRecord
|
|
|
|
| 75 |
#define cudaEvent_t hipEvent_t
|
| 76 |
#define cudaEventDestroy hipEventDestroy
|
| 77 |
#define cudaFree hipFree
|
|
@@ -81,6 +82,7 @@
|
|
| 81 |
#define cudaGetDeviceProperties hipGetDeviceProperties
|
| 82 |
#define cudaGetErrorString hipGetErrorString
|
| 83 |
#define cudaGetLastError hipGetLastError
|
|
|
|
| 84 |
#ifdef GGML_HIP_UMA
|
| 85 |
#define cudaMalloc hipMallocManaged
|
| 86 |
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
|
|
@@ -104,6 +106,7 @@
|
|
| 104 |
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
| 105 |
#define cudaStreamFireAndForget hipStreamFireAndForget
|
| 106 |
#define cudaStreamNonBlocking hipStreamNonBlocking
|
|
|
|
| 107 |
#define cudaStreamSynchronize hipStreamSynchronize
|
| 108 |
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
| 109 |
#define cudaStream_t hipStream_t
|
|
@@ -10641,8 +10644,20 @@ GGML_CALL void ggml_cuda_get_device_description(int device, char * description,
|
|
| 10641 |
#define UNUSED GGML_UNUSED
|
| 10642 |
|
| 10643 |
struct ggml_backend_cuda_context {
|
| 10644 |
int device;
|
| 10645 |
std::string name;
|
|
|
|
| 10646 |
};
|
| 10647 |
|
| 10648 |
// cuda buffer
|
|
@@ -10732,9 +10747,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
|
|
| 10732 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10733 |
|
| 10734 |
ggml_cuda_set_device(ctx->device);
|
| 10735 |
-
CUDA_CHECK(
|
| 10736 |
-
CUDA_CHECK(
|
| 10737 |
-
CUDA_CHECK(cudaDeviceSynchronize());
|
| 10738 |
}
|
| 10739 |
|
| 10740 |
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
@@ -10743,26 +10757,25 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t
|
|
| 10743 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10744 |
|
| 10745 |
ggml_cuda_set_device(ctx->device);
|
| 10746 |
-
CUDA_CHECK(
|
| 10747 |
-
CUDA_CHECK(
|
| 10748 |
-
CUDA_CHECK(cudaDeviceSynchronize());
|
| 10749 |
}
|
| 10750 |
|
| 10751 |
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
| 10752 |
if (ggml_backend_buffer_is_cuda(src->buffer)) {
|
| 10753 |
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
|
| 10754 |
-
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10755 |
-
|
| 10756 |
-
|
| 10757 |
-
|
| 10758 |
-
|
| 10759 |
-
|
| 10760 |
-
CUDA_CHECK(
|
| 10761 |
-
CUDA_CHECK(cudaDeviceSynchronize());
|
| 10762 |
-
|
| 10763 |
return true;
|
| 10764 |
}
|
| 10765 |
return false;
|
|
|
|
|
|
|
| 10766 |
}
|
| 10767 |
|
| 10768 |
GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
@@ -11007,7 +11020,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
|
|
| 11007 |
}
|
| 11008 |
|
| 11009 |
const char * buf_host = (const char *)data + offset_split;
|
| 11010 |
-
CUDA_CHECK(
|
|
| 11011 |
}
|
| 11012 |
}
|
| 11013 |
|
|
@@ -11041,7 +11058,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
|
|
| 11041 |
}
|
| 11042 |
|
| 11043 |
char * buf_host = (char *)data + offset_split;
|
| 11044 |
-
CUDA_CHECK(
|
| 11045 |
}
|
| 11046 |
}
|
| 11047 |
|
|
@@ -11220,6 +11241,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|
| 11220 |
return &ggml_backend_cuda_buffer_type_host;
|
| 11221 |
}
|
| 11222 |
|
| 11223 |
// backend
|
| 11224 |
|
| 11225 |
GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
|
@@ -11243,8 +11268,9 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer
|
|
| 11243 |
|
| 11244 |
GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 11245 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
|
|
| 11246 |
|
| 11247 |
-
GGML_ASSERT(
|
| 11248 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
| 11249 |
|
| 11250 |
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
|
@@ -11252,22 +11278,61 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
|
|
| 11252 |
|
| 11253 |
GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 11254 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
|
|
| 11255 |
|
| 11256 |
-
GGML_ASSERT(
|
| 11257 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
| 11258 |
|
| 11259 |
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
| 11260 |
}
|
| 11261 |
|
| 11262 |
-
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t
|
| 11263 |
-
|
| 11264 |
|
| 11265 |
-
|
| 11266 |
-
|
| 11267 |
-
|
|
|
|
|
|
|
| 11268 |
}
|
| 11269 |
|
| 11270 |
-
|
| 11271 |
}
|
| 11272 |
|
| 11273 |
GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
|
@@ -11444,6 +11509,52 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
| 11444 |
UNUSED(backend);
|
| 11445 |
}
|
| 11446 |
|
| 11447 |
static ggml_backend_i ggml_backend_cuda_interface = {
|
| 11448 |
/* .get_name = */ ggml_backend_cuda_name,
|
| 11449 |
/* .free = */ ggml_backend_cuda_free,
|
|
@@ -11457,6 +11568,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
|
|
| 11457 |
/* .graph_plan_compute = */ NULL,
|
| 11458 |
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
| 11459 |
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
|
| 11460 |
};
|
| 11461 |
|
| 11462 |
static ggml_guid_t ggml_backend_cuda_guid() {
|
|
@@ -11475,10 +11591,11 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
|
|
| 11475 |
// not strictly necessary, but it may reduce the overhead of the first graph_compute
|
| 11476 |
ggml_cuda_set_main_device(device);
|
| 11477 |
|
| 11478 |
-
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context
|
| 11479 |
-
|
| 11480 |
-
|
| 11481 |
-
|
|
|
|
| 11482 |
|
| 11483 |
ggml_backend_t cuda_backend = new ggml_backend {
|
| 11484 |
/* .guid = */ ggml_backend_cuda_guid(),
|
|
|
|
| 72 |
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
| 73 |
#define cudaEventDisableTiming hipEventDisableTiming
|
| 74 |
#define cudaEventRecord hipEventRecord
|
| 75 |
+
#define cudaEventSynchronize hipEventSynchronize
|
| 76 |
#define cudaEvent_t hipEvent_t
|
| 77 |
#define cudaEventDestroy hipEventDestroy
|
| 78 |
#define cudaFree hipFree
|
|
|
|
| 82 |
#define cudaGetDeviceProperties hipGetDeviceProperties
|
| 83 |
#define cudaGetErrorString hipGetErrorString
|
| 84 |
#define cudaGetLastError hipGetLastError
|
| 85 |
+
#define cudaLaunchHostFunc hipLaunchHostFunc
|
| 86 |
#ifdef GGML_HIP_UMA
|
| 87 |
#define cudaMalloc hipMallocManaged
|
| 88 |
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
|
|
|
|
| 106 |
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
| 107 |
#define cudaStreamFireAndForget hipStreamFireAndForget
|
| 108 |
#define cudaStreamNonBlocking hipStreamNonBlocking
|
| 109 |
+
#define cudaStreamPerThread hipStreamPerThread
|
| 110 |
#define cudaStreamSynchronize hipStreamSynchronize
|
| 111 |
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
| 112 |
#define cudaStream_t hipStream_t
|
|
|
|
| 10644 |
#define UNUSED GGML_UNUSED
|
| 10645 |
|
| 10646 |
struct ggml_backend_cuda_context {
|
| 10647 |
+
explicit ggml_backend_cuda_context(int device) :
|
| 10648 |
+
device(device),
|
| 10649 |
+
name(GGML_CUDA_NAME + std::to_string(device)) {
|
| 10650 |
+
}
|
| 10651 |
+
|
| 10652 |
+
~ggml_backend_cuda_context() {
|
| 10653 |
+
if (copy_event != nullptr) {
|
| 10654 |
+
CUDA_CHECK(cudaEventDestroy(copy_event));
|
| 10655 |
+
}
|
| 10656 |
+
}
|
| 10657 |
+
|
| 10658 |
int device;
|
| 10659 |
std::string name;
|
| 10660 |
+
cudaEvent_t copy_event = nullptr;
|
| 10661 |
};
|
| 10662 |
|
| 10663 |
// cuda buffer
|
|
|
|
| 10747 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10748 |
|
| 10749 |
ggml_cuda_set_device(ctx->device);
|
| 10750 |
+
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
|
| 10751 |
+
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
|
|
|
| 10752 |
}
|
| 10753 |
|
| 10754 |
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
|
|
| 10757 |
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10758 |
|
| 10759 |
ggml_cuda_set_device(ctx->device);
|
| 10760 |
+
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
|
| 10761 |
+
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
|
|
|
| 10762 |
}
|
| 10763 |
|
| 10764 |
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
| 10765 |
if (ggml_backend_buffer_is_cuda(src->buffer)) {
|
| 10766 |
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
|
| 10767 |
+
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
|
| 10768 |
+
if (src_ctx->device == dst_ctx->device) {
|
| 10769 |
+
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
|
| 10770 |
+
} else {
|
| 10771 |
+
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
|
| 10772 |
+
}
|
| 10773 |
+
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
|
|
|
|
|
|
| 10774 |
return true;
|
| 10775 |
}
|
| 10776 |
return false;
|
| 10777 |
+
|
| 10778 |
+
UNUSED(buffer);
|
| 10779 |
}
|
| 10780 |
|
| 10781 |
GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
|
|
| 11020 |
}
|
| 11021 |
|
| 11022 |
const char * buf_host = (const char *)data + offset_split;
|
| 11023 |
+
CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
|
| 11024 |
+
}
|
| 11025 |
+
|
| 11026 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 11027 |
+
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
| 11028 |
}
|
| 11029 |
}
|
| 11030 |
|
|
|
|
| 11058 |
}
|
| 11059 |
|
| 11060 |
char * buf_host = (char *)data + offset_split;
|
| 11061 |
+
CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
|
| 11062 |
+
}
|
| 11063 |
+
|
| 11064 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 11065 |
+
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
| 11066 |
}
|
| 11067 |
}
|
| 11068 |
|
|
|
|
| 11241 |
return &ggml_backend_cuda_buffer_type_host;
|
| 11242 |
}
|
| 11243 |
|
| 11244 |
+
//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
|
| 11245 |
+
// return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
|
| 11246 |
+
//}
|
| 11247 |
+
|
| 11248 |
// backend
|
| 11249 |
|
| 11250 |
GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
|
|
|
| 11268 |
|
| 11269 |
GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 11270 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 11271 |
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 11272 |
|
| 11273 |
+
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
| 11274 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
| 11275 |
|
| 11276 |
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
|
|
|
| 11278 |
|
| 11279 |
GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 11280 |
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 11281 |
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 11282 |
|
| 11283 |
+
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
| 11284 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
| 11285 |
|
| 11286 |
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
| 11287 |
}
|
| 11288 |
|
| 11289 |
+
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
|
| 11290 |
+
GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
|
| 11291 |
|
| 11292 |
+
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
| 11293 |
+
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
| 11294 |
+
|
| 11295 |
+
if (!ggml_backend_buffer_is_cuda(src->buffer)) {
|
| 11296 |
+
return false;
|
| 11297 |
}
|
| 11298 |
|
| 11299 |
+
if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
|
| 11300 |
+
return false;
|
| 11301 |
+
}
|
| 11302 |
+
|
| 11303 |
+
// device -> device
|
| 11304 |
+
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
|
| 11305 |
+
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
|
| 11306 |
+
|
| 11307 |
+
if (backend_src != backend_dst) {
|
| 11308 |
+
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
|
| 11309 |
+
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
|
| 11310 |
+
|
| 11311 |
+
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
|
| 11312 |
+
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
|
| 11313 |
+
|
| 11314 |
+
if (!cuda_ctx_src->copy_event) {
|
| 11315 |
+
ggml_cuda_set_device(cuda_ctx_src->device);
|
| 11316 |
+
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
|
| 11317 |
+
}
|
| 11318 |
+
|
| 11319 |
+
// copy on src stream
|
| 11320 |
+
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
|
| 11321 |
+
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
|
| 11322 |
+
} else {
|
| 11323 |
+
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_src->device][0]));
|
| 11324 |
+
}
|
| 11325 |
+
|
| 11326 |
+
// record event on src stream
|
| 11327 |
+
CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, g_cudaStreams[cuda_ctx_src->device][0]));
|
| 11328 |
+
|
| 11329 |
+
// wait on dst stream for the copy to complete
|
| 11330 |
+
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], cuda_ctx_src->copy_event, 0));
|
| 11331 |
+
} else {
|
| 11332 |
+
// src and dst are on the same backend
|
| 11333 |
+
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
|
| 11334 |
+
}
|
| 11335 |
+
return true;
|
| 11336 |
}
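For the device-to-device path the function above follows the standard CUDA stream/event handshake: issue the copy on the source device's stream, record an event behind it, and queue a wait on the destination stream so the host never blocks. A simplified sketch of that idiom with raw CUDA runtime calls (handles assumed, not part of the diff):

// Sketch only: a copy produced on stream_src becomes visible to work queued on stream_dst.
CUDA_CHECK(cudaMemcpyPeerAsync(dst_ptr, dst_device, src_ptr, src_device, nbytes, stream_src));
CUDA_CHECK(cudaEventRecord(copy_event, stream_src));        // mark the point after the copy
CUDA_CHECK(cudaStreamWaitEvent(stream_dst, copy_event, 0)); // dst stream waits, host does not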
|
| 11337 |
|
 GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {

     UNUSED(backend);
 }

+static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
+    cudaEvent_t event;
+    CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+
+    return new ggml_backend_event {
+        /* .backend = */ backend,
+        /* .context = */ event,
+    };
+}
+
+static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
+    CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
+
+    delete event;
+}
+
+static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
+
+    CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0]));
+}
+
+static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    if (ggml_backend_is_cuda(event->backend)) {
+        CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
+    } else {
+        // untested
+        auto wait_fn = [](void * user_data) {
+            ggml_backend_event_t event = (ggml_backend_event_t)user_data;
+            ggml_backend_event_synchronize(event);
+        };
+
+        CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
+    }
+}
+
+static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
+    CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
+}
+
 static ggml_backend_i ggml_backend_cuda_interface = {
     /* .get_name = */ ggml_backend_cuda_name,
     /* .free = */ ggml_backend_cuda_free,

     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_cuda_graph_compute,
     /* .supports_op = */ ggml_backend_cuda_supports_op,
+    /* .event_new = */ ggml_backend_cuda_event_new,
+    /* .event_free = */ ggml_backend_cuda_event_free,
+    /* .event_record = */ ggml_backend_cuda_event_record,
+    /* .event_wait = */ ggml_backend_cuda_event_wait,
+    /* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
 };

 static ggml_guid_t ggml_backend_cuda_guid() {

     // not strictly necessary, but it may reduce the overhead of the first graph_compute
     ggml_cuda_set_main_device(device);

+    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
+    if (ctx == nullptr) {
+        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+        return nullptr;
+    }

     ggml_backend_t cuda_backend = new ggml_backend {
         /* .guid = */ ggml_backend_cuda_guid(),
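The five callbacks above are the CUDA implementation of the new per-backend event interface; the hunks that follow simply stub the same `ggml_backend_i` entries out with NULL for the Kompute, Metal, SYCL and Vulkan backends. A rough usage sketch is shown here, assuming the public `ggml_backend_event_*` wrappers forward one-to-one to the table entries; only `ggml_backend_event_synchronize` is named in this diff, so the other wrapper names are assumptions:

```c
#include "ggml-backend.h"

// sketch: one backend records an event after its work, another waits on it
void wait_for_producer(ggml_backend_t producer, ggml_backend_t consumer) {
    ggml_backend_event_t ev = ggml_backend_event_new(producer);

    // ... enqueue asynchronous work on the producer backend here ...

    ggml_backend_event_record(ev);          // mark the point the producer has reached
    ggml_backend_event_wait(consumer, ev);  // the consumer waits on-device; the host is not blocked

    // ... enqueue dependent work on the consumer backend here ...

    ggml_backend_event_synchronize(ev);     // optional host-side wait
    ggml_backend_event_free(ev);
}
```

The point of issuing the wait to the consumer's device queue rather than the host is that it lets the pipeline-parallel scheduler keep several GPUs busy on different micro-batches at once, which is the stated goal of this change.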
@@ -1951,6 +1951,11 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_kompute_graph_compute,
     /* .supports_op = */ ggml_backend_kompute_supports_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+    /* .event_synchronize = */ NULL,
 };

 static ggml_guid_t ggml_backend_kompute_guid() {
@@ -2820,6 +2820,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_metal_graph_compute,
     /* .supports_op = */ ggml_backend_metal_supports_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+    /* .event_synchronize = */ NULL,
 };

 void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
@@ -17249,13 +17249,18 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
     /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
     /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
-    /* .cpy_tensor_async = */ ggml_backend_sycl_cpy_tensor_async,
+    /* .cpy_tensor_async = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface
     /* .synchronize = */ ggml_backend_sycl_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_sycl_graph_compute,
     /* .supports_op = */ ggml_backend_sycl_supports_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+    /* .event_synchronize = */ NULL,
 };

 static ggml_guid_t ggml_backend_sycl_guid() {
@@ -5693,6 +5693,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
     /* .supports_op = */ ggml_backend_vk_supports_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+    /* .event_synchronize = */ NULL,
 };

 static ggml_guid_t ggml_backend_vk_guid() {
@@ -11560,8 +11560,6 @@ static void ggml_compute_forward_get_rows_q(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];

-    assert(params->ith == 0);
-
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
     }

@@ -11569,7 +11567,7 @@ static void ggml_compute_forward_get_rows_q(
     GGML_TENSOR_BINARY_OP_LOCALS

     const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1);
+    const int64_t nr = ggml_nelements(src1);

     const enum ggml_type type = src0->type;
     ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;

@@ -11579,17 +11577,25 @@ static void ggml_compute_forward_get_rows_q(
     assert(nb00 == ggml_type_size(type));
     assert(ggml_nrows(dst) == nr);

-    for (int64_t i11 = 0; i11 < ne11; ++i11) {
-        for (int64_t i10 = 0; i10 < ne10; ++i10) {
-            const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        dequantize_row_q(
+                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
     }
 }

@@ -11600,8 +11606,6 @@ static void ggml_compute_forward_get_rows_f16(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];

-    assert(params->ith == 0);
-
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
     }

@@ -11609,24 +11613,32 @@ static void ggml_compute_forward_get_rows_f16(
     GGML_TENSOR_BINARY_OP_LOCALS

     const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1);
+    const int64_t nr = ggml_nelements(src1);

     assert(ne0 == nc);
     assert(ne02 == ne11);
     assert(nb00 == sizeof(ggml_fp16_t));
     assert(ggml_nrows(dst) == nr);

+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        ggml_fp16_to_fp32_row(
+                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
     }
 }

@@ -11637,8 +11649,6 @@ static void ggml_compute_forward_get_rows_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];

-    assert(params->ith == 0);
-
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
     }

@@ -11646,24 +11656,32 @@ static void ggml_compute_forward_get_rows_f32(
     GGML_TENSOR_BINARY_OP_LOCALS

     const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1);
+    const int64_t nr = ggml_nelements(src1);

     assert(ne0 == nc);
     assert(ne02 == ne11);
     assert(nb00 == sizeof(float));
     assert(ggml_nrows(dst) == nr);

-    for (int64_t i11 = 0; i11 < ne11; ++i11) {
-        for (int64_t i10 = 0; i10 < ne10; ++i10) {
-            const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        ggml_vec_cpy_f32(nc,
+                (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3),
+                (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
     }
 }

@@ -17796,7 +17814,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
     node->perf_time_us += time_us_cur;
 }

-static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
+static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
     int n_tasks = 0;

     switch (node->op) {

@@ -17877,6 +17895,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = n_threads;
             } break;
+        case GGML_OP_GET_ROWS:
+            {
+                // FIXME: the cost of launching additional threads decreases performance with GPU offloading
+                //n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
+                n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1]));
+            } break;
         case GGML_OP_SCALE:
         case GGML_OP_SET:
         case GGML_OP_CONT:

@@ -17884,7 +17908,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
-        case GGML_OP_GET_ROWS:
         case GGML_OP_GET_ROWS_BACK:
         case GGML_OP_DIAG:
             {

@@ -18102,7 +18125,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             /* FINALIZE */
             struct ggml_tensor * node = cgraph->nodes[node_n];
             if (GGML_OP_HAS_FINALIZE[node->op]) {
-                params.nth = ggml_get_n_tasks(node, n_threads);
+                params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
                 ggml_compute_forward(&params, node);
             }
             ggml_graph_compute_perf_stats_node(node, state->shared);

@@ -18112,7 +18135,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         while (++node_n < cgraph->n_nodes) {
             GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
             struct ggml_tensor * node = cgraph->nodes[node_n];
-            const int n_tasks = ggml_get_n_tasks(node, n_threads);
+            const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);

             state->shared->perf_node_start_cycles = ggml_perf_cycles();
             state->shared->perf_node_start_time_us = ggml_perf_time_us();

@@ -18160,7 +18183,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

         /* INIT & COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
-        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+        const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);

         struct ggml_compute_params params = {
             /*.type =*/ GGML_TASK_TYPE_INIT,

@@ -18225,7 +18248,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];

-        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+        const int n_tasks = ggml_get_n_tasks(node, n_threads, 1);

         max_tasks = MAX(max_tasks, n_tasks);
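All three `get_rows` variants above split the `nr = ggml_nelements(src1)` destination rows across threads with the same ceiling division (`dr = (nr + nth - 1)/nth`; thread `ith` handles rows `[dr*ith, MIN(dr*ith + dr, nr))`) and then unflatten each row index back into `(i12, i11, i10)`. A small standalone check of that arithmetic, with illustrative extents only:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    // illustrative extents for src1: ne10 x ne11 x ne12 row indices
    const int64_t ne10 = 7, ne11 = 3, ne12 = 2;
    const int64_t nr   = ne10*ne11*ne12;
    const int     nth  = 4; // number of threads

    int64_t covered = 0;
    for (int ith = 0; ith < nth; ++ith) {
        // rows per thread (ceiling division), same formula as in the diff
        const int64_t dr  = (nr + nth - 1)/nth;
        const int64_t ir0 = dr*ith;
        const int64_t ir1 = MIN(ir0 + dr, nr);

        for (int64_t i = ir0; i < ir1; ++i) {
            // unflatten i into (i12, i11, i10), as the new loops do
            const int64_t i12 = i/(ne11*ne10);
            const int64_t i11 = (i - i12*ne11*ne10)/ne10;
            const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);

            assert(i == (i12*ne11 + i11)*ne10 + i10); // decomposition round-trips
            covered++;
        }
    }

    assert(covered == nr); // the disjoint per-thread ranges cover every row
    printf("ok: %lld rows covered by %d threads\n", (long long) covered, nth);
    return 0;
}
```

The `MIN` clamp on `ir1` keeps the last thread from running past `nr` when the division is not exact, and the new `GGML_OP_GET_ROWS` case in `ggml_get_n_tasks` caps the thread count at `ggml_nelements(node->src[1])` so no more tasks are launched than there are rows to copy.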