slaren, compilade, and ggerganov committed on
Commit b5bb3f3 · unverified · 1 Parent(s): 9a4e508

llama : add pipeline parallelism support (llama/6017)


* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism;
  the default is increased to 4 (from 2); a short usage sketch follows the change list below

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-causal attention

* llama : do not limit n_batch to n_ctx with non-causal attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
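
As a usage note (not part of the diff below): on the llama.cpp side this change splits a logical batch of up to `n_batch` tokens into micro-batches of `n_ubatch` tokens (the new `-ub, --ubatch-size` flag), and the scheduler keeps up to `LLAMA_SCHED_MAX_COPIES` copies of the split inputs so consecutive micro-batches can overlap across GPUs. A hedged sketch of how an application might pick the two sizes, assuming the `llama_context_params` fields that this commit targets:

```c
#include "llama.h"

// Sketch only: choose a large logical batch and a smaller micro-batch so the
// scheduler can pipeline consecutive micro-batches across multiple GPUs.
// Field names follow the llama.cpp API around this change; treat them as
// assumptions if you are porting this to a different version.
static struct llama_context * make_ctx(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx    = 4096;
    cparams.n_batch  = 2048; // logical batch size (new default in this change)
    cparams.n_ubatch = 512;  // physical micro-batch size (-ub, --ubatch-size)
    return llama_new_context_with_model(model, cparams);
}
```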

Files changed (11)
  1. ggml-alloc.c +45 -64
  2. ggml-alloc.h +13 -5
  3. ggml-backend-impl.h +14 -3
  4. ggml-backend.c +358 -135
  5. ggml-backend.h +41 -17
  6. ggml-cuda.cu +146 -29
  7. ggml-kompute.cpp +5 -0
  8. ggml-metal.m +5 -0
  9. ggml-sycl.cpp +6 -1
  10. ggml-vulkan.cpp +5 -0
  11. ggml.c +68 -45
ggml-alloc.c CHANGED
@@ -61,7 +61,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-// TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
     size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
@@ -69,25 +68,14 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
 }
 
 // tallocr
-struct ggml_tallocr {
-    ggml_backend_buffer_t buffer;
-    void * base;
-    size_t alignment;
-    size_t offset;
-};
-
-ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
-    ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
-    if (talloc == NULL) {
-        return NULL;
-    }
 
+struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     void * base = ggml_backend_buffer_get_base(buffer);
     size_t align = ggml_backend_buffer_get_alignment(buffer);
 
     assert(align && !(align & (align - 1))); // power of 2
 
-    *talloc = (struct ggml_tallocr) {
+    struct ggml_tallocr talloc = (struct ggml_tallocr) {
        /*.buffer    = */ buffer,
        /*.base      = */ base,
        /*.alignment = */ align,
@@ -96,11 +84,7 @@ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     return talloc;
 }
 
-void ggml_tallocr_free(ggml_tallocr_t talloc) {
-    free(talloc);
-}
-
-void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
     size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
     size = GGML_PAD(size, talloc->alignment);
 
@@ -354,12 +338,16 @@ struct hash_node {
     bool allocated;
 };
 
-//
 struct tensor_alloc {
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
 
+struct leaf_alloc {
+    int buffer_id;
+    struct tensor_alloc leaf;
+};
+
 struct node_alloc {
     int buffer_id;
     struct tensor_alloc dst;
@@ -378,7 +366,7 @@ struct ggml_gallocr {
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
 
-    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    struct leaf_alloc * leaf_allocs; // [n_leafs]
     int n_leafs;
 };
 
@@ -543,13 +531,20 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
     return node_buffer_ids ? node_buffer_ids[i] : 0;
 }
 
-static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
 
+    // allocate leafs
+    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
+    }
+
     // count number of children and views
-    // allocate all graph inputs and leafs first to avoid overwriting them
+    // allocate other graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
@@ -577,19 +572,6 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         }
     }
 
-    // allocate the remaining leafs that are unused on the graph
-    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-
-        if (hn->n_children == 0) {
-            assert(!hn->allocated);
-            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
-            ggml_gallocr_allocate_node(galloc, leaf, 0);
-        }
-    }
-
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -652,7 +634,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     size_t hash_size = graph->visited_hash_table.size;
 
     // initialize hash table
@@ -676,7 +658,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
 
     // allocate in hash table
-    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
+    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
 
     // set the node_allocs from the hash table
     if (galloc->n_nodes < graph->n_nodes) {
@@ -711,15 +693,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
     if (galloc->n_leafs < graph->n_leafs) {
         free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
         GGML_ASSERT(galloc->leaf_allocs != NULL);
     }
     galloc->n_leafs = graph->n_leafs;
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].offset = hn->offset;
-        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
+        galloc->leaf_allocs[i].leaf.offset = hn->offset;
+        galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
     }
 
     // reallocate buffers if needed
@@ -727,7 +710,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
-        if (new_size > cur_size) {
+        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
+        if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
@@ -744,30 +728,30 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 }
 
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-    return ggml_gallocr_reserve_n(galloc, graph, NULL);
+    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
-    if (node->view_src != NULL) {
-        if (node->buffer == NULL) {
+    if (tensor->view_src != NULL) {
+        if (tensor->buffer == NULL) {
             assert(tensor_alloc->offset == SIZE_MAX);
-            if (node->view_src->buffer == NULL) {
+            if (tensor->view_src->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
         }
     } else {
-        if (node->data == NULL) {
+        if (tensor->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
             void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
         } else {
-            if (node->buffer == NULL) {
+            if (tensor->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
@@ -843,13 +827,18 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
 
     // reset buffers
     for (int i = 0; i < galloc->n_buffers; i++) {
-        // zero size buffers are not allocated
        if (galloc->buffers[i] != NULL) {
            ggml_backend_buffer_reset(galloc->buffers[i]);
        }
    }
 
    // allocate the graph tensors from the previous assignments
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+    }
    // nodes
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
@@ -863,12 +852,6 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
        }
        }
        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
    }
-    // leafs
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
-    }
 
    return true;
 }
@@ -900,12 +883,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
        return false;
    }
 
-    struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
+    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
 
    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
        if (t->data == NULL) {
            if (t->view_src == NULL) {
-                ggml_tallocr_alloc(tallocr, t);
+                ggml_tallocr_alloc(&tallocr, t);
            } else if (t->buffer == NULL) {
                ggml_backend_view_init(buffer, t);
            }
@@ -917,8 +900,6 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
        }
    }
 
-    ggml_tallocr_free(tallocr);
-
    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
    (*buffers)[(*n_buffers)++] = buffer;
 
ggml-alloc.h CHANGED
@@ -11,11 +11,15 @@ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 typedef struct ggml_backend * ggml_backend_t;
 
 // Tensor allocator
-typedef struct ggml_tallocr * ggml_tallocr_t;
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
 
-GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
-GGML_API void           ggml_tallocr_free(ggml_tallocr_t talloc);
-GGML_API void           ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
 
 // Graph allocator
 /*
@@ -50,7 +54,11 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);
 
 // automatic reallocation if the topology changes when using a single buffer
 // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
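
To make the intent of the two `ggml-alloc` changes above concrete, here is a minimal sketch (not taken from the commit) of how a caller uses the by-value `ggml_tallocr` and the extended `ggml_gallocr_reserve_n()`; the tensor/graph setup is elided and the buffer-id arrays are hypothetical:

```c
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// ggml_tallocr is now a plain struct returned by value, so there is no
// ggml_tallocr_free() call anymore.
static void place_tensors(ggml_backend_buffer_t buffer, struct ggml_context * ctx) {
    struct ggml_tallocr talloc = ggml_tallocr_new(buffer);
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        if (t->data == NULL && t->view_src == NULL) {
            ggml_tallocr_alloc(&talloc, t);
        }
    }
}

// ggml_gallocr_reserve_n() now also takes per-leaf buffer ids, so graph inputs
// (leafs) can be pinned to a specific backend buffer; this is what lets the
// scheduler keep one input copy per pipeline stage.
static bool reserve_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph,
                          const int * node_buffer_ids, const int * leaf_buffer_ids) {
    return ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids, leaf_buffer_ids);
}
```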
ggml-backend-impl.h CHANGED
@@ -86,12 +86,12 @@ extern "C" {
     // (optional) asynchronous tensor data access
     void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-    bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+    bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
     // (optional) complete all pending operations
     void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
-    // create a plan for ggml_cgraph and free it
+    // compute graph with a plan (not used currently)
     ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
     void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
@@ -102,16 +102,27 @@ extern "C" {
 
     // check if the backend supports an operation
     bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+    // (optional) event synchronization
+    ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
+    void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
+    void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
+    void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
+    void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
 };
 
 struct ggml_backend {
     ggml_guid_t guid;
 
     struct ggml_backend_i iface;
-
     ggml_backend_context_t context;
 };
 
+struct ggml_backend_event {
+    ggml_backend_t backend;
+    void * context;
+};
+
 //
 // Backend registry
 //
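
The new `event_*` hooks are what lets the scheduler overlap micro-batches: the producing backend records an event after finishing a split's work, and the consuming backend waits on it without stalling the host. A rough sketch of that pattern using the public wrappers added in `ggml-backend.c` below (the two-backend setup is assumed, not part of this commit):

```c
#include "ggml-backend.h"

// Sketch only: hand results from backend_a to backend_b without blocking the CPU.
// Backends that do not implement events return NULL from ggml_backend_event_new(),
// in which case the caller falls back to ggml_backend_synchronize().
static void handoff(ggml_backend_t backend_a, ggml_backend_t backend_b,
                    struct ggml_cgraph * graph_a, struct ggml_cgraph * graph_b) {
    ggml_backend_event_t ev = ggml_backend_event_new(backend_a);

    ggml_backend_graph_compute_async(backend_a, graph_a);
    if (ev != NULL) {
        ggml_backend_event_record(ev);          // marks the end of backend_a's queued work
        ggml_backend_event_wait(backend_b, ev); // backend_b waits on the device, not on the host
    } else {
        ggml_backend_synchronize(backend_a);
    }
    ggml_backend_graph_compute_async(backend_b, graph_b);

    ggml_backend_event_free(ev); // safe to call with NULL
}
```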
ggml-backend.c CHANGED
@@ -221,29 +221,29 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
     if (!size) {
         return;
     }
 
-    tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
+    buf->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
 GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
     if (!size) {
         return;
     }
 
-    tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
+    buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
 void ggml_backend_synchronize(ggml_backend_t backend) {
@@ -255,18 +255,30 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
 }
 
 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend->iface.graph_plan_create != NULL);
+
     return backend->iface.graph_plan_create(backend, cgraph);
 }
 
 void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_free != NULL);
+
     backend->iface.graph_plan_free(backend, plan);
 }
 
 enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
+
     return backend->iface.graph_plan_compute(backend, plan);
 }
 
 enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+    ggml_backend_synchronize(backend);
+    return err;
+}
+
+bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }
 
@@ -314,34 +326,68 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
 }
 
-void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
 
     if (src == dst) {
         return;
     }
 
-    if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
-        if (backend->iface.cpy_tensor_async != NULL) {
-            if (backend->iface.cpy_tensor_async(backend, src, dst)) {
-                return;
-            }
+    if (backend_dst->iface.cpy_tensor_async != NULL) {
+        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
+            return;
         }
     }
 
-    size_t nbytes = ggml_nbytes(src);
+    // an async copy would normally happen after all the queued operations on both backends are completed
+    // sync src, set_async dst
     if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
-    }
-    else {
+        ggml_backend_synchronize(backend_src);
+        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
+    } else {
+        ggml_backend_synchronize(backend_src);
         ggml_backend_tensor_copy(src, dst);
+        ggml_backend_synchronize(backend_dst);
+    }
+}
+
+// events
+
+ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
+    if (backend->iface.event_new == NULL) {
+        return NULL;
+    }
+    return backend->iface.event_new(backend);
+}
+
+void ggml_backend_event_free(ggml_backend_event_t event) {
+    if (event == NULL) {
+        return;
     }
+    event->backend->iface.event_free(event);
+}
+
+void ggml_backend_event_record(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_record != NULL);
+
+    event->backend->iface.event_record(event);
+}
+
+void ggml_backend_event_synchronize(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
+
+    event->backend->iface.event_synchronize(event);
 }
 
+void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    GGML_ASSERT(backend->iface.event_wait != NULL);
+
+    backend->iface.event_wait(backend, event);
+}
 
 // backend registry
 
-#define GGML_MAX_BACKENDS_REG 16
+#define GGML_REG_MAX_BACKENDS 16
 
 struct ggml_backend_reg {
     char name[128];
@@ -350,7 +396,7 @@ struct ggml_backend_reg {
     void * user_data;
 };
 
-static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
+static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
 static size_t ggml_backend_registry_count = 0;
 
 GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
@@ -395,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 }
 
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
-    GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
+    GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
 
     size_t id = ggml_backend_registry_count;
 
@@ -746,8 +792,12 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
 
     if (cpu_ctx->work_size < cplan.work_size) {
-        // TODO: may be faster to free and use malloc to avoid the copy
-        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
+        free(cpu_ctx->work_data);
+        cpu_ctx->work_data = malloc(cplan.work_size);
+        if (cpu_ctx->work_data == NULL) {
+            cpu_ctx->work_size = 0;
+            return GGML_STATUS_ALLOC_FAILED;
+        }
         cpu_ctx->work_size = cplan.work_size;
     }
     cplan.work_data = cpu_ctx->work_data;
@@ -784,6 +834,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .supports_op             = */ ggml_backend_cpu_supports_op,
+    /* .event_new               = */ NULL,
+    /* .event_free              = */ NULL,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+    /* .event_synchronize       = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -939,15 +994,27 @@ static bool ggml_is_view_op(enum ggml_op op) {
 
 // scheduler
 
-#define GGML_MAX_BACKENDS 16
-#define GGML_MAX_SPLITS 256
-#define GGML_MAX_SPLIT_INPUTS 16
+#ifndef GGML_SCHED_MAX_BACKENDS
+#define GGML_SCHED_MAX_BACKENDS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLITS
+#define GGML_SCHED_MAX_SPLITS 256
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
+#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_COPIES
+#define GGML_SCHED_MAX_COPIES 4
+#endif
 
 struct ggml_backend_sched_split {
     int backend_id;
     int i_start;
     int i_end;
-    struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
+    struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
     // graph view of this split
     struct ggml_cgraph graph;
@@ -955,45 +1022,53 @@ struct ggml_backend_sched_split {
 
 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
+    bool is_alloc;
 
     int n_backends;
-    ggml_backend_t backends[GGML_MAX_BACKENDS];
-    ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
 
+    ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
+    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;
 
     // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
     // hash values
     int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
+    struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
 
-    int * node_backend_ids; // [n_nodes]
-    int n_nodes;
+    int * node_backend_ids; // [graph_size]
+    int * leaf_backend_ids; // [graph_size]
 
     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
 
-    struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
+    // graph splits
+    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
     int n_splits;
 
+    // pipeline parallelism support
+    int n_copies;
+    int cur_copy;
+    ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
+    int n_graph_inputs;
+
     struct ggml_context * ctx;
 
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;
 
     // align context_buffer to GGML_MEM_ALIGN
 #ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
 #else
     __attribute__((aligned(GGML_MEM_ALIGN)))
 #endif
-    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+    char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 };
 
-#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
-#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
-#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
+#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
 
 // returns the priority of the backend, lower id is higher priority
 static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@@ -1005,7 +1080,8 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
 }
 
-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+    ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
@@ -1016,12 +1092,16 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
             return i;
         }
     }
-    GGML_ASSERT(false && "tensor buffer type not supported by any backend");
-    return -1; // silence warning
+
+    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
+            __func__, ggml_backend_buffer_name(buffer), tensor->name);
+    GGML_ASSERT(false);
+
+    return -1;
 }
 
 #if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1035,19 +1115,28 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 
     // assign pre-allocated nodes to their backend
     // dst
-    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
+    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
     if (cur_backend != -1) {
-        SET_CAUSE(node, "1.dst");
+        SET_CAUSE(tensor, "1.dst");
         return cur_backend;
     }
+
     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
+        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
        if (cur_backend != -1) {
-            SET_CAUSE(node, "1.vsrc");
+            SET_CAUSE(tensor, "1.vsrc");
            return cur_backend;
        }
    }
+
+    // input
+    if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
+        cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+        SET_CAUSE(tensor, "1.inp");
+        return cur_backend;
+    }
+
    // assign nodes that use weights to the backend of the weights
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        const struct ggml_tensor * src = tensor->src[i];
@@ -1055,9 +1144,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
            continue;
        }
        if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
            // operations with weights are always run on the same backend as the weights
-            SET_CAUSE(node, "1.wgt%d", i);
            return src_backend;
        }
    }
@@ -1093,7 +1182,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
        if (ggml_is_view_op(node->op)) {
            continue;
        }
-        ggml_backend_t tensor_backend = tensor_backend(node);
        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
            fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
        for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1101,7 +1190,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
            if (src == NULL) {
                continue;
            }
-            ggml_backend_t src_backend = tensor_backend(src);
            fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
        }
@@ -1118,6 +1207,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    // reset splits
    sched->n_splits = 0;
    sched->is_reset = false;
 
    struct ggml_init_params params = {
@@ -1163,7 +1253,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        }
    }
 #ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
 #endif
 
    // pass 2: expand current backend assignments
@@ -1171,10 +1261,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
    // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
    // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
 
-    // pass 2.1 expand gpu up
    {
        int cur_backend_id = -1;
-        for (int i = graph->n_nodes - 1; i >= 0; i--) {
            struct ggml_tensor * node = graph->nodes[i];
            if (ggml_is_view_op(node->op)) {
                continue;
@@ -1189,15 +1280,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            }
        } else {
            tensor_backend_id(node) = cur_backend_id;
-            SET_CAUSE(node, "2.1");
        }
        }
    }
 
-    // pass 2.2 expand gpu down
    {
        int cur_backend_id = -1;
-        for (int i = 0; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];
            if (ggml_is_view_op(node->op)) {
                continue;
@@ -1212,15 +1303,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            }
        } else {
            tensor_backend_id(node) = cur_backend_id;
-            SET_CAUSE(node, "2.2");
        }
        }
    }
 
-    // pass 2.3 expand rest up
    {
        int cur_backend_id = -1;
-        for (int i = graph->n_nodes - 1; i >= 0; i--) {
            struct ggml_tensor * node = graph->nodes[i];
            if (ggml_is_view_op(node->op)) {
                continue;
@@ -1230,15 +1322,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            cur_backend_id = tensor_backend_id;
        } else {
            tensor_backend_id(node) = cur_backend_id;
-            SET_CAUSE(node, "2.3");
        }
        }
    }
-
-    // pass 2.4 expand rest down
    {
        int cur_backend_id = -1;
-        for (int i = 0; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];
            if (ggml_is_view_op(node->op)) {
                continue;
@@ -1248,12 +1339,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            cur_backend_id = tensor_backend_id;
        } else {
            tensor_backend_id(node) = cur_backend_id;
-            SET_CAUSE(node, "2.4");
        }
        }
    }
 #ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
 #endif
 
    // pass 3: assign backends to remaining src from dst and view_src
@@ -1283,7 +1375,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        }
    }
 #ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
 #endif
 
    // pass 4: split graph, find tensors that need to be copied
@@ -1315,7 +1407,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (tensor_backend_id != cur_backend_id) {
                sched->splits[cur_split].i_end = i;
                cur_split++;
-                GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
                sched->splits[cur_split].backend_id = tensor_backend_id;
                sched->splits[cur_split].i_start = i;
                sched->splits[cur_split].n_inputs = 0;
@@ -1328,25 +1420,57 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                if (src == NULL) {
                    continue;
                }
                int src_backend_id = tensor_backend_id(src);
                assert(src_backend_id != -1); // all inputs should be assigned by now
                if (src_backend_id != tensor_backend_id) {
                    // create a copy of the input in the split's backend
                    size_t id = hash_id(src);
-                    if (sched->tensor_copies[id][cur_backend_id] == NULL) {
                        ggml_backend_t backend = sched->backends[cur_backend_id];
-                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                        sched->tensor_copies[id][cur_backend_id] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = cur_backend_id;
-                        SET_CAUSE(tensor_copy, "4.cpy");
-
                        int n_inputs = sched->splits[cur_split].n_inputs++;
-                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                        sched->splits[cur_split].inputs[n_inputs] = src;
                    }
-                    node->src[j] = sched->tensor_copies[id][cur_backend_id];
                }
            }
        }
@@ -1354,37 +1478,39 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        sched->n_splits = cur_split + 1;
    }
 #ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
 #endif
 
 #ifndef NDEBUG
    // sanity check: all sources should have the same backend as the node
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend = tensor_backend(node);
        if (tensor_backend == NULL) {
            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
        }
-        if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
                node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
-            ggml_backend_t src_backend = tensor_backend(src);
            if (src_backend != tensor_backend /* && src_backend != NULL */) {
                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
                    node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
                    j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
            }
-            if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
                    src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
            }
        }
    }
@@ -1392,18 +1518,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 #endif
 
    // create copies of the graph for each split
-    // FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
    for (int i = 0; i < sched->n_splits; i++) {
        struct ggml_backend_sched_split * split = &sched->splits[i];
        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
        for (int j = 0; j < split->n_inputs; j++) {
            struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
 
            // add a dependency to the input source so that it is not freed before the copy is done
            struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
            graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
 
@@ -1417,18 +1545,56 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
        }
    }
    sched->graph = graph_copy;
 }
 
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
-    // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
    if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
 #ifndef NDEBUG
-        fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
 #endif
-        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
        if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
-            fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
            return false;
        }
    }
@@ -1437,9 +1603,6 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
    }
 
 static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
-    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
-    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
-
    struct ggml_backend_sched_split * splits = sched->splits;
 
    for (int i = 0; i < sched->n_splits; i++) {
@@ -1448,34 +1611,36 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
        ggml_backend_t split_backend = sched->backends[split_backend_id];
 
        // copy the input tensors to the split backend
-        uint64_t copy_start_us = ggml_time_us();
        for (int j = 0; j < split->n_inputs; j++) {
            struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
 
-            GGML_ASSERT(input->buffer != NULL);
-            GGML_ASSERT(input_cpy->buffer != NULL);
 
-            ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
        }
-        //ggml_backend_synchronize(split_backend); // necessary to measure copy time
-        int64_t copy_end_us = ggml_time_us();
-        copy_us[split_backend_id] += copy_end_us - copy_start_us;
 
-#if 0
-        char split_filename[GGML_MAX_NAME];
-        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
-        ggml_graph_dump_dot(split->graph, NULL, split_filename);
-#endif
-
-
-        uint64_t compute_start_us = ggml_time_us();
        if (!sched->callback_eval) {
-            enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
            if (ec != GGML_STATUS_SUCCESS) {
                return ec;
            }
-            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
        } else {
            // similar to ggml_backend_compare_graph_backend
            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
@@ -1494,11 +1659,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
 
                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
 
-                enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
                if (ec != GGML_STATUS_SUCCESS) {
                    return ec;
                }
 
                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                    break;
                }
@@ -1506,39 +1674,54 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                j0 = j1;
            }
        }
-        uint64_t compute_end_us = ggml_time_us();
-        compute_us[split_backend_id] += compute_end_us - compute_start_us;
-    }
 
-#if 0
-    // per-backend timings
-    fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits);
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (copy_us[i] > 0 || compute_us[i] > 0) {
-            fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
        }
    }
-#endif
 
    return GGML_STATUS_SUCCESS;
 }
 
-ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
    GGML_ASSERT(n_backends > 0);
-    GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
 
    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
 
    // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
    sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
 
    sched->n_backends = n_backends;
-    for (int i = 0; i < n_backends; i++) {
-        sched->backends[i] = backends[i];
-        sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
    }
 
    sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
@@ -1552,12 +1735,18 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
    if (sched == NULL) {
        return;
    }
    ggml_gallocr_free(sched->galloc);
    ggml_free(sched->ctx);
    free(sched->hash_set.keys);
    free(sched->tensor_backend_id);
    free(sched->tensor_copies);
    free(sched->node_backend_ids);
    free(sched);
 }
 
@@ -1569,34 +1758,63 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
    memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
 
    sched->is_reset = true;
 }
 
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
    ggml_backend_sched_split_graph(sched, measure_graph);
 
-    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
        return false;
    }
 
    ggml_backend_sched_reset(sched);
    return true;
 }
 
 enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
 
-    if (!sched->is_reset) {
        ggml_backend_sched_reset(sched);
    }
 
-    ggml_backend_sched_split_graph(sched, graph);
-    if (!ggml_backend_sched_alloc_splits(sched)) {
-        return GGML_STATUS_ALLOC_FAILED;
    }
 
    return ggml_backend_sched_compute_splits(sched);
 }
 
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
    sched->callback_eval = callback;
    sched->callback_eval_user_data = user_data;
@@ -1606,19 +1824,24 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
    return sched->n_splits;
 }
 
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }
 
-void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
    tensor_backend_id(node) = backend_index;
 }
 
-ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
    int backend_index = tensor_backend_id(node);
    if (backend_index == -1) {
        return NULL;
 
1144
  continue;
1145
  }
1146
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1147
+ int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
1148
  // operations with weights are always run on the same backend as the weights
1149
+ SET_CAUSE(tensor, "1.wgt%d", i);
1150
  return src_backend;
1151
  }
1152
  }
 
1182
  if (ggml_is_view_op(node->op)) {
1183
  continue;
1184
  }
1185
+ ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
1186
  fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
1187
  fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1188
  for (int j = 0; j < GGML_MAX_SRC; j++) {
 
1190
  if (src == NULL) {
1191
  continue;
1192
  }
1193
+ ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
1194
  fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1195
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1196
  }
 
1207
  static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1208
  // reset splits
1209
  sched->n_splits = 0;
1210
+ sched->n_graph_inputs = 0;
1211
  sched->is_reset = false;
1212
 
1213
  struct ggml_init_params params = {
 
1253
  }
1254
  }
1255
  #ifdef DEBUG_PASS1
1256
+ fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1257
  #endif
1258
 
1259
  // pass 2: expand current backend assignments
 
1261
  // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1262
  // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1263
 
1264
+
1265
+ // pass 2.2 expand gpu down
1266
  {
1267
  int cur_backend_id = -1;
1268
+ for (int i = 0; i < graph->n_nodes; i++) {
1269
  struct ggml_tensor * node = graph->nodes[i];
1270
  if (ggml_is_view_op(node->op)) {
1271
  continue;
 
1280
  }
1281
  } else {
1282
  tensor_backend_id(node) = cur_backend_id;
1283
+ SET_CAUSE(node, "2.2");
1284
  }
1285
  }
1286
  }
1287
 
1288
+ // pass 2.1 expand gpu up
1289
  {
1290
  int cur_backend_id = -1;
1291
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1292
  struct ggml_tensor * node = graph->nodes[i];
1293
  if (ggml_is_view_op(node->op)) {
1294
  continue;
 
1303
  }
1304
  } else {
1305
  tensor_backend_id(node) = cur_backend_id;
1306
+ SET_CAUSE(node, "2.1");
1307
  }
1308
  }
1309
  }
1310
 
1311
+
1312
+ // pass 2.4 expand rest down
1313
  {
1314
  int cur_backend_id = -1;
1315
+ for (int i = 0; i < graph->n_nodes; i++) {
1316
  struct ggml_tensor * node = graph->nodes[i];
1317
  if (ggml_is_view_op(node->op)) {
1318
  continue;
 
1322
  cur_backend_id = tensor_backend_id;
1323
  } else {
1324
  tensor_backend_id(node) = cur_backend_id;
1325
+ SET_CAUSE(node, "2.4");
1326
  }
1327
  }
1328
  }
1329
+ // pass 2.3 expand rest up
 
1330
  {
1331
  int cur_backend_id = -1;
1332
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1333
  struct ggml_tensor * node = graph->nodes[i];
1334
  if (ggml_is_view_op(node->op)) {
1335
  continue;
 
1339
  cur_backend_id = tensor_backend_id;
1340
  } else {
1341
  tensor_backend_id(node) = cur_backend_id;
1342
+ SET_CAUSE(node, "2.3");
1343
  }
1344
  }
1345
  }
1346
+
1347
  #ifdef DEBUG_PASS2
1348
+ fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1349
  #endif
1350
 
1351
  // pass 3: assign backends to remaining src from dst and view_src
 
1375
  }
1376
  }
1377
  #ifdef DEBUG_PASS3
1378
+ fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1379
  #endif
1380
 
1381
  // pass 4: split graph, find tensors that need to be copied
 
1407
  if (tensor_backend_id != cur_backend_id) {
1408
  sched->splits[cur_split].i_end = i;
1409
  cur_split++;
1410
+ GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
1411
  sched->splits[cur_split].backend_id = tensor_backend_id;
1412
  sched->splits[cur_split].i_start = i;
1413
  sched->splits[cur_split].n_inputs = 0;
 
1420
  if (src == NULL) {
1421
  continue;
1422
  }
1423
+
1424
  int src_backend_id = tensor_backend_id(src);
1425
  assert(src_backend_id != -1); // all inputs should be assigned by now
1426
+
1427
+ if (src->flags & GGML_TENSOR_FLAG_INPUT) {
1428
+ size_t id = hash_id(src);
1429
+ if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
1430
+ ggml_backend_t backend = sched->backends[src_backend_id];
1431
+ for (int c = 0; c < sched->n_copies; c++) {
1432
+ struct ggml_tensor * tensor_copy;
1433
+ if (c == sched->cur_copy) {
1434
+ tensor_copy = src; // use the original tensor as the current copy
1435
+ } else {
1436
+ tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1437
+ ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
1438
+ }
1439
+ if (sched->n_copies > 1) {
1440
+ ggml_set_input(tensor_copy);
1441
+ ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1442
+ }
1443
+ sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
1444
+ tensor_backend_id(tensor_copy) = src_backend_id;
1445
+ SET_CAUSE(tensor_copy, "4.cpy");
1446
+ }
1447
+ int n_graph_inputs = sched->n_graph_inputs++;
1448
+ GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1449
+ sched->graph_inputs[n_graph_inputs] = src;
1450
+ }
1451
+ }
1452
+
1453
  if (src_backend_id != tensor_backend_id) {
1454
  // create a copy of the input in the split's backend
1455
  size_t id = hash_id(src);
1456
+ if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
1457
  ggml_backend_t backend = sched->backends[cur_backend_id];
1458
+ for (int c = 0; c < sched->n_copies; c++) {
1459
+ struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1460
+ ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
1461
+ if (sched->n_copies > 1) {
1462
+ ggml_set_input(tensor_copy);
1463
+ ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1464
+ }
1465
+ sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
1466
+ tensor_backend_id(tensor_copy) = cur_backend_id;
1467
+ SET_CAUSE(tensor_copy, "4.cpy");
1468
+ }
1469
  int n_inputs = sched->splits[cur_split].n_inputs++;
1470
+ GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1471
  sched->splits[cur_split].inputs[n_inputs] = src;
1472
  }
1473
+ node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
1474
  }
1475
  }
1476
  }
 
1478
  sched->n_splits = cur_split + 1;
1479
  }
1480
  #ifdef DEBUG_PASS4
1481
+ fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1482
  #endif
1483
 
1484
  #ifndef NDEBUG
1485
  // sanity check: all sources should have the same backend as the node
1486
  for (int i = 0; i < graph->n_nodes; i++) {
1487
  struct ggml_tensor * node = graph->nodes[i];
1488
+ ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
1489
  if (tensor_backend == NULL) {
1490
  fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
1491
  }
1492
+ if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
1493
  fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
1494
  node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
1495
+ node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
1496
+ ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
1497
  }
1498
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1499
  struct ggml_tensor * src = node->src[j];
1500
  if (src == NULL) {
1501
  continue;
1502
  }
1503
+ ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
1504
  if (src_backend != tensor_backend /* && src_backend != NULL */) {
1505
  fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
1506
  node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
1507
  j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
1508
  }
1509
+ if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
1510
  fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
1511
  src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
1512
+ src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
1513
+ ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
1514
  }
1515
  }
1516
  }
 
1518
  #endif
1519
 
1520
  // create copies of the graph for each split
1521
+ // TODO: avoid this copy
1522
+ struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
1523
  for (int i = 0; i < sched->n_splits; i++) {
1524
  struct ggml_backend_sched_split * split = &sched->splits[i];
1525
  split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
1526
 
1527
+ // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1528
  for (int j = 0; j < split->n_inputs; j++) {
1529
  struct ggml_tensor * input = split->inputs[j];
1530
+ struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
1531
 
1532
  // add a dependency to the input source so that it is not freed before the copy is done
1533
  struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
1534
+ input_dep->src[0] = input;
1535
  sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
1536
  graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1537
 
 
1545
  graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1546
  }
1547
  }
1548
+
1549
+ if (sched->n_copies > 1) {
1550
+ // add input copies as leafs so that they are allocated first
1551
+ for (int i = 0; i < sched->n_graph_inputs; i++) {
1552
+ struct ggml_tensor * input = sched->graph_inputs[i];
1553
+ size_t id = hash_id(input);
1554
+ int backend_id = tensor_backend_id(input);
1555
+ for (int c = 0; c < sched->n_copies; c++) {
1556
+ struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
1557
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1558
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1559
+ }
1560
+ }
1561
+
1562
+ for (int i = 0; i < sched->n_splits; i++) {
1563
+ struct ggml_backend_sched_split * split = &sched->splits[i];
1564
+ int backend_id = split->backend_id;
1565
+ for (int j = 0; j < split->n_inputs; j++) {
1566
+ struct ggml_tensor * input = split->inputs[j];
1567
+ size_t id = hash_id(input);
1568
+ for (int c = 0; c < sched->n_copies; c++) {
1569
+ struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
1570
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1571
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1572
+ }
1573
+ }
1574
+ }
1575
+ }
1576
+
1577
+ // add leafs from the original graph
1578
+ for (int i = 0; i < graph->n_leafs; i++) {
1579
+ struct ggml_tensor * leaf = graph->leafs[i];
1580
+ sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1581
+ graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1582
+ }
1583
+
1584
  sched->graph = graph_copy;
1585
  }
1586
 
1587
  static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1588
+ // allocate graph
1589
  if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1590
+ // the re-allocation may cause the split inputs to be moved to a different address
1591
+ ggml_backend_sched_synchronize(sched);
1592
  #ifndef NDEBUG
1593
+ fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
1594
  #endif
1595
+ ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1596
  if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1597
+ fprintf(stderr, "%s: failed to allocate graph\n", __func__);
1598
  return false;
1599
  }
1600
  }
 
1603
  }
1604
 
1605
  static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
 
 
 
1606
  struct ggml_backend_sched_split * splits = sched->splits;
1607
 
1608
  for (int i = 0; i < sched->n_splits; i++) {
 
1611
  ggml_backend_t split_backend = sched->backends[split_backend_id];
1612
 
1613
  // copy the input tensors to the split backend
 
1614
  for (int j = 0; j < split->n_inputs; j++) {
1615
+ ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1616
  struct ggml_tensor * input = split->inputs[j];
1617
+ struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
1618
 
1619
+ if (input->flags & GGML_TENSOR_FLAG_INPUT) {
1620
+ // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
1621
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1622
+ ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1623
+ } else {
1624
+ ggml_backend_synchronize(split_backend);
1625
+ }
1626
+ ggml_backend_tensor_copy(input, input_cpy);
1627
+ } else {
1628
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1629
+ ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
1630
+ } else {
1631
+ ggml_backend_synchronize(split_backend);
1632
+ ggml_backend_synchronize(input_backend);
1633
+ }
1634
 
1635
+ ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
1636
+ }
1637
  }
 
 
 
1638
 
 
 
 
 
 
 
 
 
1639
  if (!sched->callback_eval) {
1640
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
1641
  if (ec != GGML_STATUS_SUCCESS) {
1642
  return ec;
1643
  }
 
1644
  } else {
1645
  // similar to ggml_backend_compare_graph_backend
1646
  for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
 
1659
 
1660
  struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
1661
 
1662
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
1663
  if (ec != GGML_STATUS_SUCCESS) {
1664
  return ec;
1665
  }
1666
 
1667
+ // TODO: pass backend to the callback, then the user can decide if they want to synchronize
1668
+ ggml_backend_synchronize(split_backend);
1669
+
1670
  if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1671
  break;
1672
  }
 
1674
  j0 = j1;
1675
  }
1676
  }
 
 
 
1677
 
1678
+ // record the event of this copy
1679
+ if (split->n_inputs > 0) {
1680
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1681
+ ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
1682
+ }
 
1683
  }
1684
  }
1685
+
1686
+ sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1687
 
1688
  return GGML_STATUS_SUCCESS;
1689
  }
1690
 
1691
+ ggml_backend_sched_t ggml_backend_sched_new(
1692
+ ggml_backend_t * backends,
1693
+ ggml_backend_buffer_type_t * bufts,
1694
+ int n_backends,
1695
+ size_t graph_size,
1696
+ bool parallel) {
1697
  GGML_ASSERT(n_backends > 0);
1698
+ GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
1699
+ GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
1700
 
1701
  struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
1702
 
1703
  // initialize hash table
1704
+ sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
1705
  sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
1706
  sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
1707
  sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
1708
+ sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
1709
 
1710
  sched->n_backends = n_backends;
1711
+
1712
+ sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
1713
+
1714
+ GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
1715
+
1716
+ for (int b = 0; b < n_backends; b++) {
1717
+ sched->backends[b] = backends[b];
1718
+ sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
1719
+ GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
1720
+ if (sched->n_copies > 1) {
1721
+ for (int c = 0; c < sched->n_copies; c++) {
1722
+ sched->events[b][c] = ggml_backend_event_new(backends[b]);
1723
+ }
1724
+ }
1725
  }
1726
 
1727
  sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
 
1735
  if (sched == NULL) {
1736
  return;
1737
  }
1738
+ for (int b = 0; b < sched->n_backends; b++) {
1739
+ for (int c = 0; c < sched->n_copies; c++) {
1740
+ ggml_backend_event_free(sched->events[b][c]);
1741
+ }
1742
+ }
1743
  ggml_gallocr_free(sched->galloc);
1744
  ggml_free(sched->ctx);
1745
  free(sched->hash_set.keys);
1746
  free(sched->tensor_backend_id);
1747
  free(sched->tensor_copies);
1748
  free(sched->node_backend_ids);
1749
+ free(sched->leaf_backend_ids);
1750
  free(sched);
1751
  }
1752
 
 
1758
  memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
1759
 
1760
  sched->is_reset = true;
1761
+ sched->is_alloc = false;
1762
  }
1763
 
1764
  bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1765
  ggml_backend_sched_split_graph(sched, measure_graph);
1766
 
1767
+ // TODO: extract this to a separate function
1768
+ if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1769
  return false;
1770
  }
1771
 
1772
  ggml_backend_sched_reset(sched);
1773
+ ggml_backend_sched_synchronize(sched);
1774
+
1775
+ return true;
1776
+ }
1777
+
1778
+ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1779
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
1780
+
1781
+ ggml_backend_sched_split_graph(sched, graph);
1782
+
1783
+ if (!ggml_backend_sched_alloc_splits(sched)) {
1784
+ return false;
1785
+ }
1786
+
1787
+ sched->is_alloc = true;
1788
+
1789
  return true;
1790
  }
1791
 
1792
  enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1793
+ enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
1794
+ ggml_backend_sched_synchronize(sched);
1795
+ return err;
1796
+ }
1797
 
1798
+ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1799
+ if (!sched->is_reset && !sched->is_alloc) {
1800
  ggml_backend_sched_reset(sched);
1801
  }
1802
 
1803
+ if (!sched->is_alloc) {
1804
+ if (!ggml_backend_sched_alloc_graph(sched, graph)) {
1805
+ return GGML_STATUS_ALLOC_FAILED;
1806
+ }
1807
  }
1808
 
1809
  return ggml_backend_sched_compute_splits(sched);
1810
  }
1811
 
1812
+ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
1813
+ for (int i = 0; i < sched->n_backends; i++) {
1814
+ ggml_backend_synchronize(sched->backends[i]);
1815
+ }
1816
+ }
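A minimal sketch of how the asynchronous entry points above combine (this mirrors what ggml_backend_sched_graph_compute does; the helper name is hypothetical, and sched/graph are assumed to be set up as in the header example):

static enum ggml_status compute_async_sketch(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    enum ggml_status status = ggml_backend_sched_graph_compute_async(sched, graph);
    if (status != GGML_STATUS_SUCCESS) {
        return status;
    }
    // other host-side work can overlap with backend execution here
    ggml_backend_sched_synchronize(sched); // block until all backends have finished
    return GGML_STATUS_SUCCESS;
}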
1817
+
1818
  void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
1819
  sched->callback_eval = callback;
1820
  sched->callback_eval_user_data = user_data;
 
1824
  return sched->n_splits;
1825
  }
1826
 
1827
+ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
1828
+ return sched->n_copies;
1829
+ }
1830
+
1831
  size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
1832
  int backend_index = ggml_backend_sched_backend_id(sched, backend);
1833
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1834
+
1835
  return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
1836
  }
1837
 
1838
+ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
1839
  int backend_index = ggml_backend_sched_backend_id(sched, backend);
1840
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1841
  tensor_backend_id(node) = backend_index;
1842
  }
1843
 
1844
+ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
1845
  int backend_index = tensor_backend_id(node);
1846
  if (backend_index == -1) {
1847
  return NULL;
ggml-backend.h CHANGED
@@ -9,6 +9,7 @@ extern "C" {
9
 
10
  typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
11
  typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 
12
  typedef struct ggml_backend * ggml_backend_t;
13
  typedef void * ggml_backend_graph_plan_t;
14
 
@@ -72,11 +73,24 @@ extern "C" {
72
  GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
73
  GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
74
 
 
75
  GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
76
 
77
  // tensor copy between different backends
78
  GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
79
- GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  //
82
  // CPU backend
@@ -123,27 +137,31 @@ extern "C" {
123
  /*
124
  Example usage:
125
 
126
- sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
127
- // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
 
128
 
129
- // initialize buffers from a measure graph
130
- measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
131
 
132
- // in build_graph:
133
- build_graph(...) {
134
- // manually assign nodes to a backend (optional, should not be needed in most cases)
135
- struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
136
- ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
137
- }
138
 
139
- // allocate backend buffers from measure graph
140
- ggml_backend_sched_init_measure(sched, measure_graph);
 
141
 
142
- // the scheduler is now ready to compute graphs
143
 
144
  // compute
145
  graph = build_graph(sched);
146
  ggml_backend_sched_graph_compute(sched, graph);
 
 
 
 
 
 
 
147
  */
148
 
149
  struct ggml_backend_sched;
@@ -158,20 +176,26 @@ extern "C" {
158
  typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
159
 
160
  // Initialize a backend scheduler
161
- GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
162
  GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
 
163
  // Initialize backend buffers from a measure graph
164
  GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
 
165
  // Get the number of splits of the last graph
166
  GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
 
167
 
168
  GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
169
 
170
- GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
171
- GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
172
 
173
  // Allocate and compute graph on the backend scheduler
 
174
  GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
 
 
175
 
176
  // Reset all assignments and allocators - must be called before changing the node backends
177
  GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
9
 
10
  typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
11
  typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
12
+ typedef struct ggml_backend_event * ggml_backend_event_t;
13
  typedef struct ggml_backend * ggml_backend_t;
14
  typedef void * ggml_backend_graph_plan_t;
15
 
 
73
  GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
74
  GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
75
 
76
+ GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
77
  GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
78
 
79
  // tensor copy between different backends
80
  GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
81
+
82
+ // asynchronous copy
83
+ // the copy is performed after all the currently queued operations in backend_src
84
+ // backend_dst will wait for the copy to complete before performing other operations
85
+ // automatic fallback to sync copy if async is not supported
86
+ GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
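A short usage sketch for the updated signature (names are placeholders; the ordering guarantees follow the comment above):

static void copy_across_backends_sketch(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                                        struct ggml_tensor * src, struct ggml_tensor * dst) {
    // queued after the work already submitted to backend_src; work submitted to
    // backend_dst afterwards will not start before the copy completes
    ggml_backend_tensor_copy_async(backend_src, backend_dst, src, dst);
    // only needed if the host must observe the copied data immediately
    ggml_backend_synchronize(backend_dst);
}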
87
+
88
+ // events
89
+ GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
90
+ GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
91
+ GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
92
+ GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
93
+ GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
94
 
95
  //
96
  // CPU backend
 
137
  /*
138
  Example usage:
139
 
140
+ // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
141
+ // preferably to run on the same backend as the buffer
142
+ ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
143
 
144
+ sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
 
145
 
146
+ // initialize buffers from a max size graph (optional)
147
+ reserve_graph = build_graph(sched, max_batch_size);
 
 
 
 
148
 
149
+ // manually assign nodes to a backend (optional, should not be needed in most cases)
150
+ struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
151
+ ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
152
 
153
+ ggml_backend_sched_reserve(sched, reserve_graph);
154
 
155
  // compute
156
  graph = build_graph(sched);
157
  ggml_backend_sched_graph_compute(sched, graph);
158
+
159
+ // if there are graph inputs:
160
+ ggml_backend_sched_reset(sched);
161
+ ggml_backend_sched_alloc_graph(sched, graph);
162
+ ggml_backend_tensor_set(input_tensor, ...);
163
+ ggml_backend_sched_graph_compute(sched, graph);
164
+ }
165
  */
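A complementary sketch for the pipeline-parallel mode enabled by the last parameter of ggml_backend_sched_new (declared below); the backend variables and the two-GPU setup are illustrative:

// with parallel = true the scheduler keeps up to GGML_SCHED_MAX_COPIES copies of the
// graph inputs and uses backend events to overlap consecutive graph evaluations
ggml_backend_t backends[] = { backend_gpu, backend_gpu2, backend_cpu }; // CPU backend must be last
ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 3, GGML_DEFAULT_GRAPH_SIZE, true);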
166
 
167
  struct ggml_backend_sched;
 
176
  typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
177
 
178
  // Initialize a backend scheduler
179
+ GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
180
  GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
181
+
182
  // Initialize backend buffers from a measure graph
183
  GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
184
+
185
  // Get the number of splits of the last graph
186
  GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
187
+ GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
188
 
189
  GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
190
 
191
+ GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
192
+ GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
193
 
194
  // Allocate and compute graph on the backend scheduler
195
+ GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
196
  GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
197
+ GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
198
+ GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
199
 
200
  // Reset all assignments and allocators - must be called before changing the node backends
201
  GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
ggml-cuda.cu CHANGED
@@ -72,6 +72,7 @@
72
  #define cudaEventCreateWithFlags hipEventCreateWithFlags
73
  #define cudaEventDisableTiming hipEventDisableTiming
74
  #define cudaEventRecord hipEventRecord
 
75
  #define cudaEvent_t hipEvent_t
76
  #define cudaEventDestroy hipEventDestroy
77
  #define cudaFree hipFree
@@ -81,6 +82,7 @@
81
  #define cudaGetDeviceProperties hipGetDeviceProperties
82
  #define cudaGetErrorString hipGetErrorString
83
  #define cudaGetLastError hipGetLastError
 
84
  #ifdef GGML_HIP_UMA
85
  #define cudaMalloc hipMallocManaged
86
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
@@ -104,6 +106,7 @@
104
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
105
  #define cudaStreamFireAndForget hipStreamFireAndForget
106
  #define cudaStreamNonBlocking hipStreamNonBlocking
 
107
  #define cudaStreamSynchronize hipStreamSynchronize
108
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
109
  #define cudaStream_t hipStream_t
@@ -10641,8 +10644,20 @@ GGML_CALL void ggml_cuda_get_device_description(int device, char * description,
10641
  #define UNUSED GGML_UNUSED
10642
 
10643
  struct ggml_backend_cuda_context {
 
 
 
 
 
 
 
 
 
 
 
10644
  int device;
10645
  std::string name;
 
10646
  };
10647
 
10648
  // cuda buffer
@@ -10732,9 +10747,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
10732
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10733
 
10734
  ggml_cuda_set_device(ctx->device);
10735
- CUDA_CHECK(cudaDeviceSynchronize());
10736
- CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
10737
- CUDA_CHECK(cudaDeviceSynchronize());
10738
  }
10739
 
10740
  GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -10743,26 +10757,25 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t
10743
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10744
 
10745
  ggml_cuda_set_device(ctx->device);
10746
- CUDA_CHECK(cudaDeviceSynchronize());
10747
- CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
10748
- CUDA_CHECK(cudaDeviceSynchronize());
10749
  }
10750
 
10751
  GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
10752
  if (ggml_backend_buffer_is_cuda(src->buffer)) {
10753
  ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
10754
- ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10755
-
10756
- ggml_cuda_set_device(src_ctx->device);
10757
- CUDA_CHECK(cudaDeviceSynchronize());
10758
- ggml_cuda_set_device(dst_ctx->device);
10759
- CUDA_CHECK(cudaDeviceSynchronize());
10760
- CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
10761
- CUDA_CHECK(cudaDeviceSynchronize());
10762
-
10763
  return true;
10764
  }
10765
  return false;
 
 
10766
  }
10767
 
10768
  GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -11007,7 +11020,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
11007
  }
11008
 
11009
  const char * buf_host = (const char *)data + offset_split;
11010
- CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
 
 
 
 
11011
  }
11012
  }
11013
 
@@ -11041,7 +11058,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
11041
  }
11042
 
11043
  char * buf_host = (char *)data + offset_split;
11044
- CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
 
 
 
 
11045
  }
11046
  }
11047
 
@@ -11220,6 +11241,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
11220
  return &ggml_backend_cuda_buffer_type_host;
11221
  }
11222
 
 
 
 
 
11223
  // backend
11224
 
11225
  GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
@@ -11243,8 +11268,9 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer
11243
 
11244
  GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
11245
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
11246
 
11247
- GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11248
  GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
11249
 
11250
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
@@ -11252,22 +11278,61 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
11252
 
11253
  GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
11254
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
11255
 
11256
- GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11257
  GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
11258
 
11259
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
11260
  }
11261
 
11262
- GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
11263
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11264
 
11265
- if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
11266
- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
11267
- return true;
 
 
11268
  }
11269
 
11270
- return false;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11271
  }
11272
 
11273
  GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
@@ -11444,6 +11509,52 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
11444
  UNUSED(backend);
11445
  }
11446
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11447
  static ggml_backend_i ggml_backend_cuda_interface = {
11448
  /* .get_name = */ ggml_backend_cuda_name,
11449
  /* .free = */ ggml_backend_cuda_free,
@@ -11457,6 +11568,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
11457
  /* .graph_plan_compute = */ NULL,
11458
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
11459
  /* .supports_op = */ ggml_backend_cuda_supports_op,
 
 
 
 
 
11460
  };
11461
 
11462
  static ggml_guid_t ggml_backend_cuda_guid() {
@@ -11475,10 +11591,11 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
11475
  // not strictly necessary, but it may reduce the overhead of the first graph_compute
11476
  ggml_cuda_set_main_device(device);
11477
 
11478
- ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
11479
- /* .device = */ device,
11480
- /* .name = */ GGML_CUDA_NAME + std::to_string(device),
11481
- };
 
11482
 
11483
  ggml_backend_t cuda_backend = new ggml_backend {
11484
  /* .guid = */ ggml_backend_cuda_guid(),
 
72
  #define cudaEventCreateWithFlags hipEventCreateWithFlags
73
  #define cudaEventDisableTiming hipEventDisableTiming
74
  #define cudaEventRecord hipEventRecord
75
+ #define cudaEventSynchronize hipEventSynchronize
76
  #define cudaEvent_t hipEvent_t
77
  #define cudaEventDestroy hipEventDestroy
78
  #define cudaFree hipFree
 
82
  #define cudaGetDeviceProperties hipGetDeviceProperties
83
  #define cudaGetErrorString hipGetErrorString
84
  #define cudaGetLastError hipGetLastError
85
+ #define cudaLaunchHostFunc hipLaunchHostFunc
86
  #ifdef GGML_HIP_UMA
87
  #define cudaMalloc hipMallocManaged
88
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
 
106
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
107
  #define cudaStreamFireAndForget hipStreamFireAndForget
108
  #define cudaStreamNonBlocking hipStreamNonBlocking
109
+ #define cudaStreamPerThread hipStreamPerThread
110
  #define cudaStreamSynchronize hipStreamSynchronize
111
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
112
  #define cudaStream_t hipStream_t
 
10644
  #define UNUSED GGML_UNUSED
10645
 
10646
  struct ggml_backend_cuda_context {
10647
+ explicit ggml_backend_cuda_context(int device) :
10648
+ device(device),
10649
+ name(GGML_CUDA_NAME + std::to_string(device)) {
10650
+ }
10651
+
10652
+ ~ggml_backend_cuda_context() {
10653
+ if (copy_event != nullptr) {
10654
+ CUDA_CHECK(cudaEventDestroy(copy_event));
10655
+ }
10656
+ }
10657
+
10658
  int device;
10659
  std::string name;
10660
+ cudaEvent_t copy_event = nullptr;
10661
  };
10662
 
10663
  // cuda buffer
 
10747
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10748
 
10749
  ggml_cuda_set_device(ctx->device);
10750
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
10751
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 
10752
  }
10753
 
10754
  GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
 
10757
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10758
 
10759
  ggml_cuda_set_device(ctx->device);
10760
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
10761
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 
10762
  }
10763
 
10764
  GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
10765
  if (ggml_backend_buffer_is_cuda(src->buffer)) {
10766
  ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
10767
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
10768
+ if (src_ctx->device == dst_ctx->device) {
10769
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
10770
+ } else {
10771
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
10772
+ }
10773
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 
 
10774
  return true;
10775
  }
10776
  return false;
10777
+
10778
+ UNUSED(buffer);
10779
  }
10780
 
10781
  GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
 
11020
  }
11021
 
11022
  const char * buf_host = (const char *)data + offset_split;
11023
+ CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
11024
+ }
11025
+
11026
+ for (int id = 0; id < g_device_count; ++id) {
11027
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
11028
  }
11029
  }
11030
 
 
11058
  }
11059
 
11060
  char * buf_host = (char *)data + offset_split;
11061
+ CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
11062
+ }
11063
+
11064
+ for (int id = 0; id < g_device_count; ++id) {
11065
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
11066
  }
11067
  }
11068
 
 
11241
  return &ggml_backend_cuda_buffer_type_host;
11242
  }
11243
 
11244
+ //static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
11245
+ // return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
11246
+ //}
11247
+
11248
  // backend
11249
 
11250
  GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
 
11268
 
11269
  GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
11270
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11271
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
11272
 
11273
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11274
  GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
11275
 
11276
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 
11278
 
11279
  GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
11280
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11281
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
11282
 
11283
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11284
  GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
11285
 
11286
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
11287
  }
11288
 
11289
+ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
11290
+ GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
11291
 
11292
+ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
11293
+ ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
11294
+
11295
+ if (!ggml_backend_buffer_is_cuda(src->buffer)) {
11296
+ return false;
11297
  }
11298
 
11299
+ if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
11300
+ return false;
11301
+ }
11302
+
11303
+ // device -> device
11304
+ ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
11305
+ ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
11306
+
11307
+ if (backend_src != backend_dst) {
11308
+ ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
11309
+ ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
11310
+
11311
+ GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
11312
+ GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
11313
+
11314
+ if (!cuda_ctx_src->copy_event) {
11315
+ ggml_cuda_set_device(cuda_ctx_src->device);
11316
+ CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
11317
+ }
11318
+
11319
+ // copy on src stream
11320
+ if (cuda_ctx_src->device == cuda_ctx_dst->device) {
11321
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
11322
+ } else {
11323
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_src->device][0]));
11324
+ }
11325
+
11326
+ // record event on src stream
11327
+ CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, g_cudaStreams[cuda_ctx_src->device][0]));
11328
+
11329
+ // wait on dst stream for the copy to complete
11330
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], cuda_ctx_src->copy_event, 0));
11331
+ } else {
11332
+ // src and dst are on the same backend
11333
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
11334
+ }
11335
+ return true;
11336
  }
11337
 
11338
  GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 
11509
  UNUSED(backend);
11510
  }
11511
 
11512
+ static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
11513
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11514
+
11515
+ ggml_cuda_set_device(cuda_ctx->device);
11516
+
11517
+ cudaEvent_t event;
11518
+ CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
11519
+
11520
+ return new ggml_backend_event {
11521
+ /* .backend = */ backend,
11522
+ /* .context = */ event,
11523
+ };
11524
+ }
11525
+
11526
+ static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
11527
+ CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
11528
+
11529
+ delete event;
11530
+ }
11531
+
11532
+ static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
11533
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
11534
+
11535
+ CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0]));
11536
+ }
11537
+
11538
+ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
11539
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11540
+
11541
+ if (ggml_backend_is_cuda(event->backend)) {
11542
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
11543
+ } else {
11544
+ // untested
11545
+ auto wait_fn = [](void * user_data) {
11546
+ ggml_backend_event_t event = (ggml_backend_event_t)user_data;
11547
+ ggml_backend_event_synchronize(event);
11548
+ };
11549
+
11550
+ CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
11551
+ }
11552
+ }
11553
+
11554
+ static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
11555
+ CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
11556
+ }
11557
+
11558
  static ggml_backend_i ggml_backend_cuda_interface = {
11559
  /* .get_name = */ ggml_backend_cuda_name,
11560
  /* .free = */ ggml_backend_cuda_free,
 
11568
  /* .graph_plan_compute = */ NULL,
11569
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
11570
  /* .supports_op = */ ggml_backend_cuda_supports_op,
11571
+ /* .event_new = */ ggml_backend_cuda_event_new,
11572
+ /* .event_free = */ ggml_backend_cuda_event_free,
11573
+ /* .event_record = */ ggml_backend_cuda_event_record,
11574
+ /* .event_wait = */ ggml_backend_cuda_event_wait,
11575
+ /* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
11576
  };
11577
 
11578
  static ggml_guid_t ggml_backend_cuda_guid() {
 
11591
  // not strictly necessary, but it may reduce the overhead of the first graph_compute
11592
  ggml_cuda_set_main_device(device);
11593
 
11594
+ ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
11595
+ if (ctx == nullptr) {
11596
+ fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
11597
+ return nullptr;
11598
+ }
11599
 
11600
  ggml_backend_t cuda_backend = new ggml_backend {
11601
  /* .guid = */ ggml_backend_cuda_guid(),
ggml-kompute.cpp CHANGED
@@ -1951,6 +1951,11 @@ static struct ggml_backend_i kompute_backend_i = {
1951
  /* .graph_plan_compute = */ NULL,
1952
  /* .graph_compute = */ ggml_backend_kompute_graph_compute,
1953
  /* .supports_op = */ ggml_backend_kompute_supports_op,
 
 
 
 
 
1954
  };
1955
 
1956
  static ggml_guid_t ggml_backend_kompute_guid() {
 
1951
  /* .graph_plan_compute = */ NULL,
1952
  /* .graph_compute = */ ggml_backend_kompute_graph_compute,
1953
  /* .supports_op = */ ggml_backend_kompute_supports_op,
1954
+ /* .event_new = */ NULL,
1955
+ /* .event_free = */ NULL,
1956
+ /* .event_record = */ NULL,
1957
+ /* .event_wait = */ NULL,
1958
+ /* .event_synchronize = */ NULL,
1959
  };
1960
 
1961
  static ggml_guid_t ggml_backend_kompute_guid() {
ggml-metal.m CHANGED
@@ -2820,6 +2820,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
2820
  /* .graph_plan_compute = */ NULL,
2821
  /* .graph_compute = */ ggml_backend_metal_graph_compute,
2822
  /* .supports_op = */ ggml_backend_metal_supports_op,
 
 
 
 
 
2823
  };
2824
 
2825
  void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
 
2820
  /* .graph_plan_compute = */ NULL,
2821
  /* .graph_compute = */ ggml_backend_metal_graph_compute,
2822
  /* .supports_op = */ ggml_backend_metal_supports_op,
2823
+ /* .event_new = */ NULL,
2824
+ /* .event_free = */ NULL,
2825
+ /* .event_record = */ NULL,
2826
+ /* .event_wait = */ NULL,
2827
+ /* .event_synchronize = */ NULL,
2828
  };
2829
 
2830
  void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
ggml-sycl.cpp CHANGED
@@ -17249,13 +17249,18 @@ static ggml_backend_i ggml_backend_sycl_interface = {
17249
  /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
17250
  /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
17251
  /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
17252
- /* .cpy_tensor_async = */ ggml_backend_sycl_cpy_tensor_async,
17253
  /* .synchronize = */ ggml_backend_sycl_synchronize,
17254
  /* .graph_plan_create = */ NULL,
17255
  /* .graph_plan_free = */ NULL,
17256
  /* .graph_plan_compute = */ NULL,
17257
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
17258
  /* .supports_op = */ ggml_backend_sycl_supports_op,
 
 
 
 
 
17259
  };
17260
 
17261
  static ggml_guid_t ggml_backend_sycl_guid() {
 
17249
  /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
17250
  /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
17251
  /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
17252
+ /* .cpy_tensor_async = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface
17253
  /* .synchronize = */ ggml_backend_sycl_synchronize,
17254
  /* .graph_plan_create = */ NULL,
17255
  /* .graph_plan_free = */ NULL,
17256
  /* .graph_plan_compute = */ NULL,
17257
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
17258
  /* .supports_op = */ ggml_backend_sycl_supports_op,
17259
+ /* .event_new = */ NULL,
17260
+ /* .event_free = */ NULL,
17261
+ /* .event_record = */ NULL,
17262
+ /* .event_wait = */ NULL,
17263
+ /* .event_synchronize = */ NULL,
17264
  };
17265
 
17266
  static ggml_guid_t ggml_backend_sycl_guid() {
ggml-vulkan.cpp CHANGED
@@ -5693,6 +5693,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
5693
  /* .graph_plan_compute = */ NULL,
5694
  /* .graph_compute = */ ggml_backend_vk_graph_compute,
5695
  /* .supports_op = */ ggml_backend_vk_supports_op,
 
 
 
 
 
5696
  };
5697
 
5698
  static ggml_guid_t ggml_backend_vk_guid() {
 
5693
  /* .graph_plan_compute = */ NULL,
5694
  /* .graph_compute = */ ggml_backend_vk_graph_compute,
5695
  /* .supports_op = */ ggml_backend_vk_supports_op,
5696
+ /* .event_new = */ NULL,
5697
+ /* .event_free = */ NULL,
5698
+ /* .event_record = */ NULL,
5699
+ /* .event_wait = */ NULL,
5700
+ /* .event_synchronize = */ NULL,
5701
  };
5702
 
5703
  static ggml_guid_t ggml_backend_vk_guid() {
ggml.c CHANGED
@@ -11560,8 +11560,6 @@ static void ggml_compute_forward_get_rows_q(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];

-    assert(params->ith == 0);
-
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
     }
@@ -11569,7 +11567,7 @@ static void ggml_compute_forward_get_rows_q(
     GGML_TENSOR_BINARY_OP_LOCALS

     const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+    const int64_t nr = ggml_nelements(src1);

     const enum ggml_type type = src0->type;
     ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
@@ -11579,17 +11577,25 @@ static void ggml_compute_forward_get_rows_q(
     assert(nb00 == ggml_type_size(type));
     assert(ggml_nrows(dst) == nr);

-    // TODO: multi-thread
-    for (int64_t i12 = 0; i12 < ne12; ++i12) {
-        for (int64_t i11 = 0; i11 < ne11; ++i11) {
-            for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+    const int ith = params->ith;
+    const int nth = params->nth;

-                dequantize_row_q(
-                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
-                             (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
-            }
-        }
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        dequantize_row_q(
+                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                     (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
 }
@@ -11600,8 +11606,6 @@ static void ggml_compute_forward_get_rows_f16(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];

-    assert(params->ith == 0);
-
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
     }
@@ -11609,24 +11613,32 @@ static void ggml_compute_forward_get_rows_f16(
     GGML_TENSOR_BINARY_OP_LOCALS

     const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+    const int64_t nr = ggml_nelements(src1);

     assert(ne0 == nc);
     assert(ne02 == ne11);
     assert(nb00 == sizeof(ggml_fp16_t));
     assert(ggml_nrows(dst) == nr);

-    // TODO: multi-thread
-    for (int64_t i12 = 0; i12 < ne12; ++i12) {
-        for (int64_t i11 = 0; i11 < ne11; ++i11) {
-            for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+    const int ith = params->ith;
+    const int nth = params->nth;

-                ggml_fp16_to_fp32_row(
-                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
-                             (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
-            }
-        }
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        ggml_fp16_to_fp32_row(
+                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                     (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
 }
@@ -11637,8 +11649,6 @@ static void ggml_compute_forward_get_rows_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];

-    assert(params->ith == 0);
-
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
     }
@@ -11646,24 +11656,32 @@ static void ggml_compute_forward_get_rows_f32(
     GGML_TENSOR_BINARY_OP_LOCALS

     const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+    const int64_t nr = ggml_nelements(src1);

     assert(ne0 == nc);
     assert(ne02 == ne11);
     assert(nb00 == sizeof(float));
     assert(ggml_nrows(dst) == nr);

-    // TODO: multi-thread
-    for (int64_t i12 = 0; i12 < ne12; ++i12) {
-        for (int64_t i11 = 0; i11 < ne11; ++i11) {
-            for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+    const int ith = params->ith;
+    const int nth = params->nth;

-                ggml_vec_cpy_f32(nc,
-                        (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
-                        (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
-            }
-        }
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        ggml_vec_cpy_f32(nc,
+                (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
+                (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
     }
 }
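The three `get_rows` kernels above now split work by destination row instead of running on a single thread: the `nr = ggml_nelements(src1)` rows are divided into chunks of `dr = (nr + nth - 1)/nth`, thread `ith` handles the half-open range `[ir0, ir1)`, and each flat index `i` is unpacked back into `(i12, i11, i10)`. A standalone sketch of the same partitioning arithmetic (hypothetical driver, not ggml code):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

// Standalone illustration of the partitioning used by the multi-threaded
// get_rows kernels: nr rows are split into ceil(nr/nth)-sized chunks and
// thread ith processes the half-open range [ir0, ir1).
int main(void) {
    const int nr  = 10; // total rows (ggml_nelements(src1) in the kernel)
    const int nth = 4;  // number of threads (params->nth)

    const int dr = (nr + nth - 1)/nth; // rows per thread, rounded up

    for (int ith = 0; ith < nth; ++ith) {
        const int ir0 = dr*ith;
        const int ir1 = MIN(ir0 + dr, nr); // the last thread may get a short chunk
        printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
    }
    return 0;
}

With nr = 10 and nth = 4, dr = 3 and the printed ranges are [0, 3), [3, 6), [6, 9) and [9, 10); a thread whose ir0 already exceeds nr simply gets an empty range and does no work.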
 
@@ -17796,7 +17814,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
     node->perf_time_us += time_us_cur;
 }

-static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
+static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
     int n_tasks = 0;

     switch (node->op) {
@@ -17877,6 +17895,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = n_threads;
             } break;
+        case GGML_OP_GET_ROWS:
+            {
+                // FIXME: the cost of launching additional threads decreases performance with GPU offloading
+                //n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
+                n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1]));
+            } break;
         case GGML_OP_SCALE:
         case GGML_OP_SET:
         case GGML_OP_CONT:
@@ -17884,7 +17908,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
-        case GGML_OP_GET_ROWS:
         case GGML_OP_GET_ROWS_BACK:
         case GGML_OP_DIAG:
             {
@@ -18102,7 +18125,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             /* FINALIZE */
             struct ggml_tensor * node = cgraph->nodes[node_n];
             if (GGML_OP_HAS_FINALIZE[node->op]) {
-                params.nth = ggml_get_n_tasks(node, n_threads);
+                params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
                 ggml_compute_forward(&params, node);
             }
             ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -18112,7 +18135,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         while (++node_n < cgraph->n_nodes) {
             GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
             struct ggml_tensor * node = cgraph->nodes[node_n];
-            const int n_tasks = ggml_get_n_tasks(node, n_threads);
+            const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);

             state->shared->perf_node_start_cycles = ggml_perf_cycles();
             state->shared->perf_node_start_time_us = ggml_perf_time_us();
@@ -18160,7 +18183,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

         /* INIT & COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
-        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+        const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);

         struct ggml_compute_params params = {
             /*.type =*/ GGML_TASK_TYPE_INIT,
@@ -18225,7 +18248,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];

-        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+        const int n_tasks = ggml_get_n_tasks(node, n_threads, 1);

         max_tasks = MAX(max_tasks, n_tasks);
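With `GGML_OP_GET_ROWS` moved out of the single-task group, `ggml_get_n_tasks()` now also receives the number of threads currently running (`n_cur_threads`) and caps the op at one task per gathered row; `ggml_graph_plan()` passes 1 here, while the compute threads pass `state->shared->n_threads`. A standalone illustration of that cap (hypothetical values, not ggml code):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

// Illustration only: the GET_ROWS task count is bounded both by the threads
// currently available and by the number of rows to gather.
int main(void) {
    const int n_cur_threads = 8; // e.g. state->shared->n_threads at compute time
    const int n_rows        = 5; // e.g. (int) ggml_nelements(node->src[1])

    const int n_tasks = MIN(n_cur_threads, n_rows);
    printf("GET_ROWS n_tasks = %d\n", n_tasks); // prints 5: fewer rows than threads
    return 0;
}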