ggerganov committed
Commit d6abb6a · unverified · 1 Parent(s): 2305485

metal : remove old API (llama/4919)

Files changed (2)
  1. ggml-metal.h +2 -53
  2. ggml-metal.m +23 -253
ggml-metal.h CHANGED
@@ -36,64 +36,13 @@ struct ggml_cgraph;
  extern "C" {
  #endif
 
- //
- // internal API
- // temporary exposed to user-code
- //
-
- struct ggml_metal_context;
-
- void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
- // number of command buffers to use
- struct ggml_metal_context * ggml_metal_init(int n_cb);
- void ggml_metal_free(struct ggml_metal_context * ctx);
-
- void * ggml_metal_host_malloc(size_t n);
- void   ggml_metal_host_free  (void * data);
-
- // set the number of command buffers to use
- void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
-
- // creates a mapping between a host memory buffer and a device memory buffer
- // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
- // - the mapping is used during computation to determine the arguments of the compute kernels
- // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
- // - max_size specifies the maximum size of a tensor and is used to create shared views such
- //   that it is guaranteed that the tensor will fit in at least one of the views
- //
- bool ggml_metal_add_buffer(
-         struct ggml_metal_context * ctx,
-                        const char * name,
-                              void * data,
-                              size_t size,
-                              size_t max_size);
-
- // set data from host memory into the device
- void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
- // get data from the device into host memory
- void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
- // try to find operations that can be run concurrently in the graph
- // you should run it again if the topology of your graph changes
- void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
-
- // if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
- int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
-
- // output the concur_list for ggml_alloc
- int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
-
- // same as ggml_graph_compute but uses Metal
- // creates gf->n_threads command buffers in parallel
- bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
-
  //
  // backend API
  // user-code should use only these functions
  //
 
+ GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
  GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
  GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
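
For callers that used the removed internal API, the replacement is the backend API kept above. Below is a minimal illustrative sketch (not taken from the commit) of the post-change usage, assuming ggml.h / ggml-backend.h provide enum ggml_log_level and ggml_backend_free as usual, and with an arbitrary n_cb value of 4:

    #include <stdio.h>

    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-metal.h"

    // matches the ggml_log_callback signature
    static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level; (void) user_data;
        fputs(text, stderr);
    }

    int main(void) {
        // replaces the removed ggml_metal_log_set_callback()
        ggml_backend_metal_log_set_callback(my_log, NULL);

        // replaces ggml_metal_init() / ggml_metal_free()
        ggml_backend_t backend = ggml_backend_metal_init();
        if (backend == NULL || !ggml_backend_is_metal(backend)) {
            fprintf(stderr, "failed to initialize the Metal backend\n");
            return 1;
        }

        // replaces ggml_metal_set_n_cb()
        ggml_backend_metal_set_n_cb(backend, 4);

        ggml_backend_free(backend);
        return 0;
    }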
ggml-metal.m CHANGED
@@ -24,8 +24,6 @@
 
  #define UNUSED(x) (void)(x)
 
- #define GGML_MAX_CONCUR (2*GGML_DEFAULT_GRAPH_SIZE)
-
  #define GGML_METAL_MAX_KERNELS 256
 
  struct ggml_metal_buffer {
@@ -182,9 +180,6 @@ struct ggml_metal_context {
 
      struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
 
-     int concur_list[GGML_MAX_CONCUR];
-     int concur_list_len;
-
      bool support_simdgroup_reduction;
      bool support_simdgroup_mm;
  };
@@ -200,7 +195,6 @@ struct ggml_metal_context {
  @implementation GGMLMetalClass
  @end
 
-
  static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
      fprintf(stderr, "%s", msg);
 
@@ -211,11 +205,6 @@ static void ggml_metal_default_log_callback(enum ggml_log_level level, const cha
  ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
  void * ggml_metal_log_user_data = NULL;
 
- void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
-     ggml_metal_log_callback  = log_callback;
-     ggml_metal_log_user_data = user_data;
- }
-
  GGML_ATTRIBUTE_FORMAT(2, 3)
  static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
      if (ggml_metal_log_callback != NULL) {
@@ -238,7 +227,18 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
      }
  }
 
- struct ggml_metal_context * ggml_metal_init(int n_cb) {
+ static void * ggml_metal_host_malloc(size_t n) {
+     void * data = NULL;
+     const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
+     if (result != 0) {
+         GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
+         return NULL;
+     }
+
+     return data;
+ }
+
+ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
      GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
      id<MTLDevice> device;
@@ -264,7 +264,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
      ctx->n_cb      = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
      ctx->queue     = [ctx->device newCommandQueue];
      ctx->n_buffers = 0;
-     ctx->concur_list_len = 0;
 
      ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
@@ -531,7 +530,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
      return ctx;
  }
 
- void ggml_metal_free(struct ggml_metal_context * ctx) {
+ static void ggml_metal_free(struct ggml_metal_context * ctx) {
      GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
 
      for (int i = 0; i < ctx->n_buffers; ++i) {
@@ -557,33 +556,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
      free(ctx);
  }
 
- void * ggml_metal_host_malloc(size_t n) {
-     void * data = NULL;
-     const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
-     if (result != 0) {
-         GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
-         return NULL;
-     }
-
-     return data;
- }
-
- void ggml_metal_host_free(void * data) {
-     free(data);
- }
-
- void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
-     ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
- }
-
- int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
-     return ctx->concur_list_len;
- }
-
- int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
-     return ctx->concur_list;
- }
-
  // temporarily defined here for compatibility between ggml-backend and the old API
 
  struct ggml_backend_metal_buffer {
@@ -656,209 +628,6 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
      return nil;
  }
 
- bool ggml_metal_add_buffer(
-         struct ggml_metal_context * ctx,
-                        const char * name,
-                              void * data,
-                              size_t size,
-                              size_t max_size) {
-     if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
-         GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__);
-         return false;
-     }
-
-     if (data) {
-         // verify that the buffer does not overlap with any of the existing buffers
-         for (int i = 0; i < ctx->n_buffers; ++i) {
-             const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
-
-             if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
-                 GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
-                 return false;
-             }
-         }
-
-         const size_t size_page = sysconf(_SC_PAGESIZE);
-
-         size_t size_aligned = size;
-         if ((size_aligned % size_page) != 0) {
-             size_aligned += (size_page - (size_aligned % size_page));
-         }
-
-         // the buffer fits into the max buffer size allowed by the device
-         if (size_aligned <= ctx->device.maxBufferLength) {
-             ctx->buffers[ctx->n_buffers].name = name;
-             ctx->buffers[ctx->n_buffers].data = data;
-             ctx->buffers[ctx->n_buffers].size = size;
-
-             ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
-
-             if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                 GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
-                 return false;
-             }
-
-             GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0);
-
-             ++ctx->n_buffers;
-         } else {
-             // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
-             // one of the views
-             const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
-             const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
-             const size_t size_view = ctx->device.maxBufferLength;
-
-             for (size_t i = 0; i < size; i += size_step) {
-                 const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
-
-                 ctx->buffers[ctx->n_buffers].name = name;
-                 ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
-                 ctx->buffers[ctx->n_buffers].size = size_step_aligned;
-
-                 ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
-
-                 if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                     GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
-                     return false;
-                 }
-
-                 GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
-                 if (i + size_step < size) {
-                     GGML_METAL_LOG_INFO("\n");
-                 }
-
-                 ++ctx->n_buffers;
-             }
-         }
-
- #if TARGET_OS_OSX
-         GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-                 ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
-                 ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
-
-         if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
-             GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
-         } else {
-             GGML_METAL_LOG_INFO("\n");
-         }
- #else
-         GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
- #endif
-     }
-
-     return true;
- }
-
- void ggml_metal_set_tensor(
-         struct ggml_metal_context * ctx,
-         struct ggml_tensor * t) {
-     size_t offs;
-     id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
-
-     memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
- }
-
- void ggml_metal_get_tensor(
-         struct ggml_metal_context * ctx,
-         struct ggml_tensor * t) {
-     size_t offs;
-     id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
-
-     memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
- }
-
- void ggml_metal_graph_find_concurrency(
-         struct ggml_metal_context * ctx,
-         struct ggml_cgraph * gf, bool check_mem) {
-     int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
-     int nodes_unused[GGML_MAX_CONCUR];
-
-     for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
-     for (int i = 0; i < gf->n_nodes;     i++) { nodes_unused[i]     = 1; }
-     ctx->concur_list_len = 0;
-
-     int n_left    = gf->n_nodes;
-     int n_start   = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
-     int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
-
-     while (n_left > 0) {
-         // number of nodes at a layer (that can be issued concurrently)
-         int concurrency = 0;
-         for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
-             if (nodes_unused[i]) {
-                 // if the requirements for gf->nodes[i] are satisfied
-                 int exe_flag = 1;
-
-                 // scan all srcs
-                 for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
-                     struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
-                     if (src_cur) {
-                         // if is leaf nodes it's satisfied.
-                         // TODO: ggml_is_leaf()
-                         if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
-                             continue;
-                         }
-
-                         // otherwise this src should be the output from previous nodes.
-                         int is_found = 0;
-
-                         // scan 2*search_depth back because we inserted barrier.
-                         //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
-                         for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
-                             if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
-                                 is_found = 1;
-                                 break;
-                             }
-                         }
-                         if (is_found == 0) {
-                             exe_flag = 0;
-                             break;
-                         }
-                     }
-                 }
-                 if (exe_flag && check_mem) {
-                     // check if nodes[i]'s data will be overwritten by a node before nodes[i].
-                     // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
-                     int64_t data_start = (int64_t) gf->nodes[i]->data;
-                     int64_t length     = (int64_t) ggml_nbytes(gf->nodes[i]);
-                     for (int j = n_start; j < i; j++) {
-                         if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
-                                             && gf->nodes[j]->op != GGML_OP_VIEW \
-                                             && gf->nodes[j]->op != GGML_OP_TRANSPOSE \
-                                             && gf->nodes[j]->op != GGML_OP_PERMUTE) {
-                             if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
-                                 ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
-                                 continue;
-                             }
-
-                             exe_flag = 0;
-                         }
-                     }
-                 }
-                 if (exe_flag) {
-                     ctx->concur_list[level_pos + concurrency] = i;
-                     nodes_unused[i] = 0;
-                     concurrency++;
-                     ctx->concur_list_len++;
-                 }
-             }
-         }
-         n_left -= concurrency;
-         // adding a barrier different layer
-         ctx->concur_list[level_pos + concurrency] = -1;
-         ctx->concur_list_len++;
-         // jump all sorted nodes at nodes_bak
-         while (!nodes_unused[n_start]) {
-             n_start++;
-         }
-         level_pos += concurrency + 1;
-     }
-
-     if (ctx->concur_list_len > GGML_MAX_CONCUR) {
-         GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__);
-     }
- }
-
  static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
      switch (op->op) {
          case GGML_OP_UNARY:
@@ -940,19 +709,15 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
      }
  }
 
- bool ggml_metal_graph_compute(
+ static bool ggml_metal_graph_compute(
          struct ggml_metal_context * ctx,
                struct ggml_cgraph * gf) {
      @autoreleasepool {
 
-     // if there is ctx->concur_list, dispatch concurrently
-     // else fallback to serial dispatch
      MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
 
-     const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
-
-     const int n_nodes  = has_concur ? ctx->concur_list_len : gf->n_nodes;
-     edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
+     const int n_nodes  = gf->n_nodes;
+     edesc.dispatchType = MTLDispatchTypeSerial;
 
      // create multiple command buffers and enqueue them
      // then, we encode the graph into the command buffers in parallel
@@ -983,7 +748,7 @@ bool ggml_metal_graph_compute(
              const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
 
              for (int ind = node_start; ind < node_end; ++ind) {
-                 const int i = has_concur ? ctx->concur_list[ind] : ind;
+                 const int i = ind;
 
                  if (i == -1) {
                      [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
@@ -2823,6 +2588,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
      /* .supports_op = */ ggml_backend_metal_supports_op,
  };
 
+ void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+     ggml_metal_log_callback  = log_callback;
+     ggml_metal_log_user_data = user_data;
+ }
+
  ggml_backend_t ggml_backend_metal_init(void) {
      struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
 
@@ -2849,7 +2619,7 @@ void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
 
      struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
 
-     ggml_metal_set_n_cb(ctx, n_cb);
+     ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
  }
 
  bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
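
With the concurrency planner (ggml_metal_graph_find_concurrency / concur_list) removed, the Metal backend now always encodes graphs with MTLDispatchTypeSerial, and graph execution goes through the generic ggml-backend entry points rather than the removed ggml_metal_graph_compute. A rough sketch under those assumptions (the helper name run_on_metal is hypothetical, and the graph's tensors are assumed to already live in a Metal backend buffer allocated through ggml-alloc, which takes over the role of the removed ggml_metal_add_buffer):

    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-metal.h"

    // hypothetical helper: run an already-built, already-allocated graph on the Metal backend
    static void run_on_metal(ggml_backend_t backend, struct ggml_cgraph * gf) {
        // tensor I/O replaces ggml_metal_set_tensor() / ggml_metal_get_tensor(), e.g.:
        //     ggml_backend_tensor_set(t, src, 0, ggml_nbytes(t));
        //     ggml_backend_tensor_get(t, dst, 0, ggml_nbytes(t));

        // internally this now always uses serial dispatch (no concur_list)
        ggml_backend_graph_compute(backend, gf);
    }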