Diego Devesa committed
Commit 582a21e · Parent: 474cc59

llama : refactor model loader with backend registry (llama/10026)

ggml/include/ggml-backend.h CHANGED
@@ -114,11 +114,12 @@ extern "C" {
     //
 
     enum ggml_backend_dev_type {
+        // CPU device using system memory
         GGML_BACKEND_DEVICE_TYPE_CPU,
+        // GPU device using dedicated memory
         GGML_BACKEND_DEVICE_TYPE_GPU,
-        // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
-        GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
-        GGML_BACKEND_DEVICE_TYPE_GPU_FULL
+        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+        GGML_BACKEND_DEVICE_TYPE_ACCEL
     };
 
     // functionality supported by the device
@@ -167,10 +168,14 @@ extern "C" {
     GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
     GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
 
-    // Functions that may be obtained using ggml_backend_reg_get_proc_address
-    typedef ggml_backend_buffer_type_t   (*ggml_backend_split_buffer_type_t)(const float *);
-    typedef void                         (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);
+    // Common functions that may be obtained using ggml_backend_reg_get_proc_address
+
+    // Split buffer type for tensor parallelism
+    typedef ggml_backend_buffer_type_t   (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+    // Set the number of threads for the backend
+    typedef void                         (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
+    // Get additional buffer types provided by the device (returns a NULL-terminated array)
+    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
 
     //
     // Backend registry
@@ -192,7 +197,7 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
     // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
     GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
-    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
    GGML_API ggml_backend_t ggml_backend_init_best(void);
 
     //
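Editor's note: a minimal sketch (not part of the commit) of how a caller might use the proc-address typedefs declared above. It assumes the CPU registry entry points wired up later in this diff (ggml_backend_cpu_reg, and the "ggml_backend_set_n_threads" / "ggml_backend_dev_get_extra_bufts" names registered in ggml-backend.cpp); backends that do not export a name simply return NULL for it.

// Sketch: querying optional per-backend functions through the registry.
#include <stdio.h>

#include "ggml-backend.h"

int main(void) {
    ggml_backend_reg_t reg     = ggml_backend_cpu_reg();
    ggml_backend_dev_t dev     = ggml_backend_reg_dev_get(reg, 0);
    ggml_backend_t     backend = ggml_backend_dev_init(dev, NULL);

    // set the number of threads, if the backend exposes that control
    ggml_backend_set_n_threads_t set_n_threads =
        (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (set_n_threads) {
        set_n_threads(backend, 8);
    }

    // list extra buffer types provided by the device (e.g. HBM on the CPU backend), if any
    ggml_backend_dev_get_extra_bufts_t get_extra_bufts =
        (ggml_backend_dev_get_extra_bufts_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
    if (get_extra_bufts) {
        for (ggml_backend_buffer_type_t * buft = get_extra_bufts(dev); buft && *buft; buft++) {
            printf("extra buffer type: %s\n", ggml_backend_buft_name(*buft));
        }
    }

    ggml_backend_free(backend);
    return 0;
}
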
ggml/include/ggml-cuda.h CHANGED
@@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
 GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
 
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
 
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
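Editor's note: a hypothetical usage sketch (not from the commit) of the new first argument. The 60/40 split below is an arbitrary example value; it requires a CUDA-enabled build with at least two devices.

// Sketch: row-split buffer type across two CUDA devices, with device 0 as the
// main device (the parameter added in this commit).
#include "ggml-cuda.h"

ggml_backend_buffer_type_t example_split_buft(void) {
    static const float tensor_split[2] = {0.6f, 0.4f};
    return ggml_backend_cuda_split_buffer_type(/*main_device=*/0, tensor_split);
}
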
ggml/src/ggml-amx.cpp CHANGED
@@ -16,12 +16,6 @@
16
  #if defined(__AMX_INT8__)
17
 
18
  // AMX buffer interface
19
- static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
20
- return "AMX";
21
-
22
- GGML_UNUSED(buffer);
23
- }
24
-
25
  static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
26
  free(buffer->context);
27
  }
@@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
72
  }
73
 
74
  static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
75
- /* .get_name = */ ggml_backend_amx_buffer_get_name,
76
  /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
77
  /* .get_base = */ ggml_backend_amx_buffer_get_base,
78
  /* .init_tensor = */ NULL, // no initialization required
@@ -121,14 +114,14 @@ static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft
121
  ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
122
  static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
123
  /* .iface = */ {
124
- /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
125
- /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
126
- /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
127
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
128
- /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
129
- /* .is_host = */ ggml_backend_amx_buffer_type_is_host,
130
  },
131
- /* .device = */ NULL,
132
  /* .context = */ NULL,
133
  };
134
 
@@ -149,12 +142,6 @@ static void ggml_backend_amx_free(ggml_backend_t backend) {
149
  delete backend;
150
  }
151
 
152
- static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
153
- return ggml_backend_amx_buffer_type();
154
-
155
- GGML_UNUSED(backend);
156
- }
157
-
158
  static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
159
  ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
160
 
@@ -187,7 +174,6 @@ static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, s
187
  static struct ggml_backend_i ggml_backend_amx_i = {
188
  /* .get_name = */ ggml_backend_amx_name,
189
  /* .free = */ ggml_backend_amx_free,
190
- /* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
191
  /* .set_tensor_async = */ NULL,
192
  /* .get_tensor_async = */ NULL,
193
  /* .cpy_tensor_async = */ NULL,
@@ -197,9 +183,6 @@ static struct ggml_backend_i ggml_backend_amx_i = {
197
  /* .graph_plan_update = */ NULL,
198
  /* .graph_plan_compute = */ NULL,
199
  /* .graph_compute = */ ggml_backend_amx_graph_compute,
200
- /* .supports_op = */ NULL,
201
- /* .supports_buft = */ NULL,
202
- /* .offload_op = */ NULL,
203
  /* .event_record = */ NULL,
204
  /* .event_wait = */ NULL,
205
  };
@@ -279,7 +262,7 @@ static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t *
279
  }
280
 
281
  static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
282
- return GGML_BACKEND_DEVICE_TYPE_CPU;
283
 
284
  GGML_UNUSED(dev);
285
  }
 
16
  #if defined(__AMX_INT8__)
17
 
18
  // AMX buffer interface
 
 
 
 
 
 
19
  static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
20
  free(buffer->context);
21
  }
 
66
  }
67
 
68
  static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
 
69
  /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
70
  /* .get_base = */ ggml_backend_amx_buffer_get_base,
71
  /* .init_tensor = */ NULL, // no initialization required
 
114
  ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
115
  static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
116
  /* .iface = */ {
117
+ /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
118
+ /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
119
+ /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
120
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
121
+ /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
122
+ /* .is_host = */ ggml_backend_amx_buffer_type_is_host,
123
  },
124
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
125
  /* .context = */ NULL,
126
  };
127
 
 
142
  delete backend;
143
  }
144
 
 
 
 
 
 
 
145
  static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
146
  ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
147
 
 
174
  static struct ggml_backend_i ggml_backend_amx_i = {
175
  /* .get_name = */ ggml_backend_amx_name,
176
  /* .free = */ ggml_backend_amx_free,
 
177
  /* .set_tensor_async = */ NULL,
178
  /* .get_tensor_async = */ NULL,
179
  /* .cpy_tensor_async = */ NULL,
 
183
  /* .graph_plan_update = */ NULL,
184
  /* .graph_plan_compute = */ NULL,
185
  /* .graph_compute = */ ggml_backend_amx_graph_compute,
 
 
 
186
  /* .event_record = */ NULL,
187
  /* .event_wait = */ NULL,
188
  };
 
262
  }
263
 
264
  static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
265
+ return GGML_BACKEND_DEVICE_TYPE_ACCEL;
266
 
267
  GGML_UNUSED(dev);
268
  }
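Editor's note: a short sketch under stated assumptions. The AMX device now reports GGML_BACKEND_DEVICE_TYPE_ACCEL, i.e. an accelerator meant to be paired with the CPU backend rather than used standalone. The enumeration helpers used here (ggml_backend_dev_count, ggml_backend_dev_get, ggml_backend_dev_type, ggml_backend_dev_name) are assumed from the registry API in ggml-backend.h, not added by this file.

// Sketch: classify registered devices under the new enum.
#include <stdio.h>

#include "ggml-backend.h"

int main(void) {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        const char * kind = "CPU";
        switch (ggml_backend_dev_type(dev)) {
            case GGML_BACKEND_DEVICE_TYPE_GPU:   kind = "GPU";                   break;
            case GGML_BACKEND_DEVICE_TYPE_ACCEL: kind = "ACCEL (pair with CPU)"; break;
            default:                             break;
        }
        printf("%-16s %s\n", ggml_backend_dev_name(dev), kind);
    }
    return 0;
}
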
ggml/src/ggml-backend-impl.h CHANGED
@@ -22,7 +22,7 @@ extern "C" {
         size_t (*get_max_size)  (ggml_backend_buffer_type_t buft);
         // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
         size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
-        // (optional) check if tensor data is in host memory (defaults to false)
+        // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
         bool   (*is_host)       (ggml_backend_buffer_type_t buft);
     };
 
@@ -37,7 +37,6 @@ extern "C" {
     //
 
     struct ggml_backend_buffer_i {
-        const char * (*get_name)   (ggml_backend_buffer_t buffer);
         // (optional) free the buffer
         void         (*free_buffer)(ggml_backend_buffer_t buffer);
         // base address of the buffer
@@ -88,19 +87,16 @@ extern "C" {
 
         void (*free)(ggml_backend_t backend);
 
-        // Will be moved to the device interface
-        // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
-
         // (optional) asynchronous tensor data access
         void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
         bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
-        // (optional) complete all pending operations
+        // (optional) complete all pending operations (required if the backend supports async operations)
         void (*synchronize)(ggml_backend_t backend);
 
-        // (optional) compute graph with a plan (not used currently)
+        // (optional) graph plans (not used currently)
+        // compute graph with a plan
         ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@@ -111,13 +107,6 @@ extern "C" {
         // compute graph (always async if supported by the backend)
         enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
-        // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
-        // new backends should implement the device interface instead
-        // These functions are being moved to the device interface
-        bool (*supports_op)  (ggml_backend_t backend, const struct ggml_tensor * op);
-        bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
-        bool (*offload_op)   (ggml_backend_t backend, const struct ggml_tensor * op);
-
         // (optional) event synchronization
         // record an event on this stream
         void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
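Editor's note: a minimal sketch (not from the commit) of the user-visible effect of dropping buffer_i.get_name: ggml_backend_buffer_name() now reports the name of the buffer's type, as implemented in ggml-backend.cpp below.

// Sketch: a plain CPU allocation now reports the buffer type name ("CPU").
#include <stdio.h>

#include "ggml-backend.h"

int main(void) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t      buf  = ggml_backend_buft_alloc_buffer(buft, 1024);

    // the name comes from the buffer type, not a per-buffer callback
    printf("buffer name: %s, size: %zu\n", ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf));

    ggml_backend_buffer_free(buf);
    return 0;
}
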
ggml/src/ggml-backend.cpp CHANGED
@@ -34,6 +34,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
34
  }
35
 
36
  ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 
 
 
 
 
37
  return buft->iface.alloc_buffer(buft, size);
38
  }
39
 
@@ -89,7 +94,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
89
  }
90
 
91
  const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
92
- return buffer->iface.get_name(buffer);
93
  }
94
 
95
  void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -108,6 +113,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
108
  }
109
 
110
  void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 
 
 
 
111
  void * base = buffer->iface.get_base(buffer);
112
 
113
  GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -122,6 +132,15 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
122
  }
123
  }
124
 
 
 
 
 
 
 
 
 
 
125
  size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
126
  return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
127
  }
@@ -134,10 +153,6 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
134
  return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
135
  }
136
 
137
- void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
138
- buffer->iface.clear(buffer, value);
139
- }
140
-
141
  bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
142
  return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
143
  }
@@ -198,7 +213,7 @@ void ggml_backend_free(ggml_backend_t backend) {
198
  }
199
 
200
  ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
201
- return backend->iface.get_default_buffer_type(backend);
202
  }
203
 
204
  ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
@@ -238,43 +253,42 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
238
  void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
239
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
240
 
 
 
 
 
241
  GGML_ASSERT(buf != NULL && "tensor buffer not set");
242
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
243
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
244
 
245
- if (!size) {
246
- return;
247
- }
248
-
249
  buf->iface.set_tensor(buf, tensor, data, offset, size);
250
  }
251
 
252
  void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
253
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
254
 
 
 
 
 
255
  GGML_ASSERT(buf != NULL && "tensor buffer not set");
256
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
257
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
258
 
259
- if (!size) {
260
- return;
261
- }
262
-
263
  buf->iface.get_tensor(buf, tensor, data, offset, size);
264
  }
265
 
266
  GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
267
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
268
 
269
- GGML_ASSERT(buf != NULL && "tensor buffer not set");
270
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
271
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
272
-
273
- if (!size) {
274
  return;
275
  }
276
 
277
- GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
 
 
 
278
 
279
  buf->iface.memset_tensor(buf, tensor, value, offset, size);
280
  }
@@ -316,32 +330,15 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
316
  }
317
 
318
  bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
319
- // helper to ease transition to device interface
320
- if (backend->device) {
321
- return ggml_backend_dev_supports_op(backend->device, op);
322
- }
323
-
324
- return backend->iface.supports_op(backend, op);
325
  }
326
 
327
  bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
328
- // helper to ease transition to device interface
329
- if (backend->device) {
330
- return ggml_backend_dev_supports_buft(backend->device, buft);
331
- }
332
- return backend->iface.supports_buft(backend, buft);
333
  }
334
 
335
  bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
336
- // helper to ease transition to device interface
337
- if (backend->device) {
338
- return ggml_backend_dev_offload_op(backend->device, op);
339
- }
340
-
341
- if (backend->iface.offload_op != NULL) {
342
- return backend->iface.offload_op(backend, op);
343
- }
344
- return false;
345
  }
346
 
347
  ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
@@ -582,6 +579,9 @@ struct ggml_backend_registry {
582
  #ifdef GGML_USE_VULKAN
583
  register_backend(ggml_backend_vk_reg());
584
  #endif
 
 
 
585
  #ifdef GGML_USE_BLAS
586
  register_backend(ggml_backend_blas_reg());
587
  #endif
@@ -591,9 +591,6 @@ struct ggml_backend_registry {
591
  #ifdef GGML_USE_AMX
592
  register_backend(ggml_backend_amx_reg());
593
  #endif
594
- #ifdef GGML_USE_CANN
595
- register_backend(ggml_backend_cann_reg());
596
- #endif
597
 
598
  // TODO: kompute
599
 
@@ -701,9 +698,9 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
701
  }
702
 
703
  ggml_backend_t ggml_backend_init_best(void) {
704
- ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
705
  if (!dev) {
706
- dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
707
  }
708
  if (!dev) {
709
  return NULL;
@@ -711,13 +708,7 @@ ggml_backend_t ggml_backend_init_best(void) {
711
  return ggml_backend_dev_init(dev, NULL);
712
  }
713
 
714
- // backend CPU
715
-
716
- static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
717
- return "CPU";
718
-
719
- GGML_UNUSED(buffer);
720
- }
721
 
722
  static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
723
  uintptr_t data = (uintptr_t)buffer->context;
@@ -767,7 +758,6 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
767
  }
768
 
769
  static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
770
- /* .get_name = */ ggml_backend_cpu_buffer_get_name,
771
  /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
772
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
773
  /* .init_tensor = */ NULL, // no initialization required
@@ -780,7 +770,6 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
780
  };
781
 
782
  static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
783
- /* .get_name = */ ggml_backend_cpu_buffer_get_name,
784
  /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
785
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
786
  /* .init_tensor = */ NULL, // no initialization required
@@ -792,6 +781,8 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
792
  /* .reset = */ NULL,
793
  };
794
 
 
 
795
  static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
796
  return "CPU";
797
 
@@ -799,19 +790,14 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
799
  }
800
 
801
  static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
802
- auto alloc_size = size;
803
- if (alloc_size == 0) {
804
- alloc_size = 1;
805
- }
806
-
807
- void * data = ggml_aligned_malloc(alloc_size);
808
 
809
  if (data == NULL) {
810
- GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
811
  return NULL;
812
  }
813
 
814
- return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
815
  }
816
 
817
  static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -843,6 +829,29 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
843
  return &ggml_backend_cpu_buffer_type;
844
  }
845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
846
  #ifdef GGML_USE_CPU_HBM
847
 
848
  // buffer type HBM
@@ -855,18 +864,11 @@ static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffe
855
  GGML_UNUSED(buft);
856
  }
857
 
858
- static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
859
- return "CPU_HBM";
860
-
861
- GGML_UNUSED(buf);
862
- }
863
-
864
  static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
865
  hbw_free(buffer->context);
866
  }
867
 
868
  static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
869
- //void * ptr = hbw_malloc(size);
870
  void * ptr;
871
  int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
872
  if (result != 0) {
@@ -876,7 +878,6 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
876
 
877
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
878
  buffer->buft = buft;
879
- buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
880
  buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
881
 
882
  return buffer;
@@ -899,6 +900,21 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
899
  }
900
  #endif
901
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
902
  struct ggml_backend_cpu_context {
903
  int n_threads;
904
  ggml_threadpool_t threadpool;
@@ -923,12 +939,6 @@ static void ggml_backend_cpu_free(ggml_backend_t backend) {
923
  delete backend;
924
  }
925
 
926
- static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
927
- return ggml_backend_cpu_buffer_type();
928
-
929
- GGML_UNUSED(backend);
930
- }
931
-
932
  struct ggml_backend_plan_cpu {
933
  struct ggml_cplan cplan;
934
  struct ggml_cgraph cgraph;
@@ -998,7 +1008,6 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
998
  static const struct ggml_backend_i ggml_backend_cpu_i = {
999
  /* .get_name = */ ggml_backend_cpu_get_name,
1000
  /* .free = */ ggml_backend_cpu_free,
1001
- /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
1002
  /* .set_tensor_async = */ NULL,
1003
  /* .get_tensor_async = */ NULL,
1004
  /* .cpy_tensor_async = */ NULL,
@@ -1008,9 +1017,6 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
1008
  /* .graph_plan_update = */ NULL,
1009
  /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
1010
  /* .graph_compute = */ ggml_backend_cpu_graph_compute,
1011
- /* .supports_op = */ NULL,
1012
- /* .supports_buft = */ NULL,
1013
- /* .offload_op = */ NULL,
1014
  /* .event_record = */ NULL,
1015
  /* .event_wait = */ NULL,
1016
  };
@@ -1081,10 +1087,10 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
1081
 
1082
  ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
1083
  GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
1084
- return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
1085
  }
1086
 
1087
- ////////////////////////
1088
 
1089
  struct ggml_backend_cpu_device_context {
1090
  std::string description = "CPU";
@@ -1171,7 +1177,7 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t *
1171
  }
1172
 
1173
  static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
1174
- return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
1175
 
1176
  GGML_UNUSED(dev);
1177
  }
@@ -1189,7 +1195,7 @@ static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggm
1189
  };
1190
  }
1191
 
1192
- static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
1193
  return ggml_backend_cpu_init();
1194
 
1195
  GGML_UNUSED(dev);
@@ -1202,7 +1208,7 @@ static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_b
1202
  GGML_UNUSED(dev);
1203
  }
1204
 
1205
- static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
1206
  return ggml_backend_cpu_buffer_from_ptr(ptr, size);
1207
 
1208
  GGML_UNUSED(dev);
@@ -1244,10 +1250,10 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
1244
  /* .get_memory = */ ggml_backend_cpu_device_get_memory,
1245
  /* .get_type = */ ggml_backend_cpu_device_get_type,
1246
  /* .get_props = */ ggml_backend_cpu_device_get_props,
1247
- /* .init_backend = */ ggml_backend_cpu_device_init,
1248
  /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
1249
  /* .get_host_buffer_type = */ NULL,
1250
- /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr,
1251
  /* .supports_op = */ ggml_backend_cpu_device_supports_op,
1252
  /* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
1253
  /* .offload_op = */ NULL,
@@ -1256,7 +1262,7 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
1256
  /* .event_synchronize = */ NULL,
1257
  };
1258
 
1259
- ////////////////////////
1260
 
1261
  static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
1262
  return "CPU";
@@ -1287,6 +1293,10 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
1287
  if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
1288
  return (void *)ggml_backend_cpu_set_n_threads;
1289
  }
 
 
 
 
1290
  return NULL;
1291
 
1292
  GGML_UNUSED(reg);
@@ -1315,12 +1325,6 @@ struct ggml_backend_multi_buffer_context {
1315
  size_t n_buffers;
1316
  };
1317
 
1318
- static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
1319
- ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
1320
-
1321
- return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
1322
- }
1323
-
1324
  static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1325
  ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
1326
  for (size_t i = 0; i < ctx->n_buffers; i++) {
@@ -1339,7 +1343,6 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_
1339
  }
1340
 
1341
  static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
1342
- /* .get_name = */ ggml_backend_multi_buffer_get_name,
1343
  /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
1344
  /* .get_base = */ NULL,
1345
  /* .init_tensor = */ NULL,
@@ -1368,7 +1371,7 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
1368
  }
1369
 
1370
  bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
1371
- return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
1372
  }
1373
 
1374
  void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
@@ -1460,7 +1463,7 @@ struct ggml_backend_sched {
1460
  char * context_buffer;
1461
  size_t context_buffer_size;
1462
 
1463
- bool debug;
1464
  };
1465
 
1466
  #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1500,7 +1503,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
1500
  return -1;
1501
  }
1502
 
1503
- #if 0
1504
  #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
1505
  static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1506
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
@@ -1548,7 +1551,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
1548
  if (src == NULL) {
1549
  continue;
1550
  }
1551
- if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
 
 
1552
  int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
1553
  // check if a backend with higher prio wants to offload the op
1554
  if (src_backend_id == sched->n_backends - 1) {
@@ -1595,19 +1600,21 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
1595
  if (ggml_is_view_op(node->op)) {
1596
  continue;
1597
  }
1598
- ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
1599
- GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
1600
- fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1601
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1602
- struct ggml_tensor * src = node->src[j];
1603
- if (src == NULL) {
1604
- continue;
 
 
 
 
 
1605
  }
1606
- ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
1607
- GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1608
- fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1609
  }
1610
- GGML_LOG_DEBUG("\n");
1611
  }
1612
  }
1613
 
@@ -1899,11 +1906,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1899
  if (src == NULL) {
1900
  continue;
1901
  }
1902
- // check if a weight is on a different backend
1903
  // by starting a new split, the memory of the previously offloaded weights can be reused
1904
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1905
  int src_backend_id = tensor_backend_id(src);
1906
- if (src_backend_id != cur_backend_id) {
1907
  need_new_split = true;
1908
  break;
1909
  }
@@ -1915,7 +1922,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1915
  int src_backend_id = sched->hv_tensor_backend_ids[id];
1916
  bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1917
  if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1918
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1919
  need_new_split = true;
1920
  break;
1921
  }
@@ -2240,7 +2246,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
2240
 
2241
  struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
2242
 
2243
- sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
 
2244
  sched->n_backends = n_backends;
2245
  sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
2246
 
 
34
  }
35
 
36
  ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
37
+ if (size == 0) {
38
+ // return a dummy buffer for zero-sized allocations
39
+ return ggml_backend_buffer_init(buft, {}, NULL, 0);
40
+ }
41
+
42
  return buft->iface.alloc_buffer(buft, size);
43
  }
44
 
 
94
  }
95
 
96
  const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
97
+ return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
98
  }
99
 
100
  void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
 
113
  }
114
 
115
  void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
116
+ // get_base is optional if the buffer is zero-sized
117
+ if (buffer->size == 0) {
118
+ return NULL;
119
+ }
120
+
121
  void * base = buffer->iface.get_base(buffer);
122
 
123
  GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
 
132
  }
133
  }
134
 
135
+ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
136
+ // clear is optional if the buffer is zero-sized
137
+ if (buffer->size == 0) {
138
+ return;
139
+ }
140
+
141
+ buffer->iface.clear(buffer, value);
142
+ }
143
+
144
  size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
145
  return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
146
  }
 
153
  return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
154
  }
155
 
 
 
 
 
156
  bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
157
  return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
158
  }
 
213
  }
214
 
215
  ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
216
+ return ggml_backend_dev_buffer_type(backend->device);
217
  }
218
 
219
  ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
 
253
  void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
254
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
255
 
256
+ if (size == 0) {
257
+ return;
258
+ }
259
+
260
  GGML_ASSERT(buf != NULL && "tensor buffer not set");
261
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
262
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
263
 
 
 
 
 
264
  buf->iface.set_tensor(buf, tensor, data, offset, size);
265
  }
266
 
267
  void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
268
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
269
 
270
+ if (size == 0) {
271
+ return;
272
+ }
273
+
274
  GGML_ASSERT(buf != NULL && "tensor buffer not set");
275
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
276
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
277
 
 
 
 
 
278
  buf->iface.get_tensor(buf, tensor, data, offset, size);
279
  }
280
 
281
  GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
282
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
283
 
284
+ if (size == 0) {
 
 
 
 
285
  return;
286
  }
287
 
288
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
289
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
290
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
291
+ GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
292
 
293
  buf->iface.memset_tensor(buf, tensor, value, offset, size);
294
  }
 
330
  }
331
 
332
  bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
333
+ return ggml_backend_dev_supports_op(backend->device, op);
 
 
 
 
 
334
  }
335
 
336
  bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
337
+ return ggml_backend_dev_supports_buft(backend->device, buft);
 
 
 
 
338
  }
339
 
340
  bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
341
+ return ggml_backend_dev_offload_op(backend->device, op);
 
 
 
 
 
 
 
 
342
  }
343
 
344
  ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
 
579
  #ifdef GGML_USE_VULKAN
580
  register_backend(ggml_backend_vk_reg());
581
  #endif
582
+ #ifdef GGML_USE_CANN
583
+ register_backend(ggml_backend_cann_reg());
584
+ #endif
585
  #ifdef GGML_USE_BLAS
586
  register_backend(ggml_backend_blas_reg());
587
  #endif
 
591
  #ifdef GGML_USE_AMX
592
  register_backend(ggml_backend_amx_reg());
593
  #endif
 
 
 
594
 
595
  // TODO: kompute
596
 
 
698
  }
699
 
700
  ggml_backend_t ggml_backend_init_best(void) {
701
+ ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
702
  if (!dev) {
703
+ dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
704
  }
705
  if (!dev) {
706
  return NULL;
 
708
  return ggml_backend_dev_init(dev, NULL);
709
  }
710
 
711
+ // CPU backend - buffer
 
 
 
 
 
 
712
 
713
  static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
714
  uintptr_t data = (uintptr_t)buffer->context;
 
758
  }
759
 
760
  static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
 
761
  /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
762
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
763
  /* .init_tensor = */ NULL, // no initialization required
 
770
  };
771
 
772
  static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
 
773
  /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
774
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
775
  /* .init_tensor = */ NULL, // no initialization required
 
781
  /* .reset = */ NULL,
782
  };
783
 
784
+ // CPU backend - buffer type
785
+
786
  static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
787
  return "CPU";
788
 
 
790
  }
791
 
792
  static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
793
+ void * data = ggml_aligned_malloc(size);
 
 
 
 
 
794
 
795
  if (data == NULL) {
796
+ GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
797
  return NULL;
798
  }
799
 
800
+ return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
801
  }
802
 
803
  static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
 
829
  return &ggml_backend_cpu_buffer_type;
830
  }
831
 
832
+ static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
833
+ return "CPU_Mapped";
834
+
835
+ GGML_UNUSED(buft);
836
+ }
837
+
838
+ static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
839
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
840
+ /* .iface = */ {
841
+ /* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
842
+ /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
843
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
844
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
845
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
846
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
847
+ },
848
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
849
+ /* .context = */ NULL,
850
+ };
851
+
852
+ return &ggml_backend_cpu_buffer_type;
853
+ }
854
+
855
  #ifdef GGML_USE_CPU_HBM
856
 
857
  // buffer type HBM
 
864
  GGML_UNUSED(buft);
865
  }
866
 
 
 
 
 
 
 
867
  static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
868
  hbw_free(buffer->context);
869
  }
870
 
871
  static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 
872
  void * ptr;
873
  int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
874
  if (result != 0) {
 
878
 
879
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
880
  buffer->buft = buft;
 
881
  buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
882
 
883
  return buffer;
 
900
  }
901
  #endif
902
 
903
+ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
904
+ static ggml_backend_buffer_type_t bufts[] = {
905
+ #ifdef GGML_USE_CPU_HBM
906
+ ggml_backend_cpu_hbm_buffer_type(),
907
+ #endif
908
+ NULL
909
+ };
910
+
911
+ return bufts;
912
+
913
+ GGML_UNUSED(device);
914
+ }
915
+
916
+ // CPU backend - backend (stream)
917
+
918
  struct ggml_backend_cpu_context {
919
  int n_threads;
920
  ggml_threadpool_t threadpool;
 
939
  delete backend;
940
  }
941
 
 
 
 
 
 
 
942
  struct ggml_backend_plan_cpu {
943
  struct ggml_cplan cplan;
944
  struct ggml_cgraph cgraph;
 
1008
  static const struct ggml_backend_i ggml_backend_cpu_i = {
1009
  /* .get_name = */ ggml_backend_cpu_get_name,
1010
  /* .free = */ ggml_backend_cpu_free,
 
1011
  /* .set_tensor_async = */ NULL,
1012
  /* .get_tensor_async = */ NULL,
1013
  /* .cpy_tensor_async = */ NULL,
 
1017
  /* .graph_plan_update = */ NULL,
1018
  /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
1019
  /* .graph_compute = */ ggml_backend_cpu_graph_compute,
 
 
 
1020
  /* .event_record = */ NULL,
1021
  /* .event_wait = */ NULL,
1022
  };
 
1087
 
1088
  ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
1089
  GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
1090
+ return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
1091
  }
1092
 
1093
+ // CPU backend - device
1094
 
1095
  struct ggml_backend_cpu_device_context {
1096
  std::string description = "CPU";
 
1177
  }
1178
 
1179
  static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
1180
+ return GGML_BACKEND_DEVICE_TYPE_CPU;
1181
 
1182
  GGML_UNUSED(dev);
1183
  }
 
1195
  };
1196
  }
1197
 
1198
+ static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
1199
  return ggml_backend_cpu_init();
1200
 
1201
  GGML_UNUSED(dev);
 
1208
  GGML_UNUSED(dev);
1209
  }
1210
 
1211
+ static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
1212
  return ggml_backend_cpu_buffer_from_ptr(ptr, size);
1213
 
1214
  GGML_UNUSED(dev);
 
1250
  /* .get_memory = */ ggml_backend_cpu_device_get_memory,
1251
  /* .get_type = */ ggml_backend_cpu_device_get_type,
1252
  /* .get_props = */ ggml_backend_cpu_device_get_props,
1253
+ /* .init_backend = */ ggml_backend_cpu_device_init_backend,
1254
  /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
1255
  /* .get_host_buffer_type = */ NULL,
1256
+ /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
1257
  /* .supports_op = */ ggml_backend_cpu_device_supports_op,
1258
  /* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
1259
  /* .offload_op = */ NULL,
 
1262
  /* .event_synchronize = */ NULL,
1263
  };
1264
 
1265
+ // CPU backend - backend (reg)
1266
 
1267
  static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
1268
  return "CPU";
 
1293
  if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
1294
  return (void *)ggml_backend_cpu_set_n_threads;
1295
  }
1296
+ if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
1297
+ return (void *)ggml_backend_cpu_get_extra_bufts;
1298
+ }
1299
+
1300
  return NULL;
1301
 
1302
  GGML_UNUSED(reg);
 
1325
  size_t n_buffers;
1326
  };
1327
 
 
 
 
 
 
 
1328
  static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1329
  ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
1330
  for (size_t i = 0; i < ctx->n_buffers; i++) {
 
1343
  }
1344
 
1345
  static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
 
1346
  /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
1347
  /* .get_base = */ NULL,
1348
  /* .init_tensor = */ NULL,
 
1371
  }
1372
 
1373
  bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
1374
+ return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
1375
  }
1376
 
1377
  void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
 
1463
  char * context_buffer;
1464
  size_t context_buffer_size;
1465
 
1466
+ int debug;
1467
  };
1468
 
1469
  #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
 
1503
  return -1;
1504
  }
1505
 
1506
+ #if 1
1507
  #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
1508
  static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1509
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 
1551
  if (src == NULL) {
1552
  continue;
1553
  }
1554
+ // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
1555
+ // not an ideal solution
1556
+ if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1557
  int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
1558
  // check if a backend with higher prio wants to offload the op
1559
  if (src_backend_id == sched->n_backends - 1) {
 
1600
  if (ggml_is_view_op(node->op)) {
1601
  continue;
1602
  }
1603
+ if (sched->debug > 1) {
1604
+ ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
1605
+ GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
1606
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1607
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1608
+ struct ggml_tensor * src = node->src[j];
1609
+ if (src == NULL) {
1610
+ continue;
1611
+ }
1612
+ ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
1613
+ GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1614
+ fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1615
  }
1616
+ GGML_LOG_DEBUG("\n");
 
 
1617
  }
 
1618
  }
1619
  }
1620
 
 
1906
  if (src == NULL) {
1907
  continue;
1908
  }
1909
+ // check if a weight is on a different and incompatible backend
1910
  // by starting a new split, the memory of the previously offloaded weights can be reused
1911
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1912
  int src_backend_id = tensor_backend_id(src);
1913
+ if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1914
  need_new_split = true;
1915
  break;
1916
  }
 
1922
  int src_backend_id = sched->hv_tensor_backend_ids[id];
1923
  bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1924
  if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
 
1925
  need_new_split = true;
1926
  break;
1927
  }
 
2246
 
2247
  struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
2248
 
2249
+ const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
2250
+ sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
2251
  sched->n_backends = n_backends;
2252
  sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
2253
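Editor's note: two behavior changes in this file are worth calling out. Zero-sized allocations now return a dummy buffer and zero-sized tensor reads/writes return early, so callers no longer need to special-case empty tensors; and GGML_SCHED_DEBUG is now parsed as an integer level, with the per-node assignment dump shown above gated on a level greater than 1. A minimal sketch of the zero-size path (not from the commit):

// Sketch: exercising the zero-size handling added in ggml-backend.cpp.
#include <assert.h>
#include <stdio.h>

#include "ggml-backend.h"

int main(void) {
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), 0);

    assert(buf != NULL);                               // dummy buffer, not a failure
    assert(ggml_backend_buffer_get_base(buf) == NULL); // get_base is optional when size == 0
    ggml_backend_buffer_clear(buf, 0);                 // no-op for a zero-sized buffer

    ggml_backend_buffer_free(buf);
    printf("zero-sized buffer handled without touching the backend\n");
    return 0;
}
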
 
ggml/src/ggml-blas.cpp CHANGED
@@ -224,12 +224,6 @@ static void ggml_backend_blas_free(ggml_backend_t backend) {
224
  delete backend;
225
  }
226
 
227
- static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
228
- return ggml_backend_cpu_buffer_type();
229
-
230
- GGML_UNUSED(backend);
231
- }
232
-
233
  static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
234
  ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
235
 
@@ -265,7 +259,6 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
265
  static struct ggml_backend_i blas_backend_i = {
266
  /* .get_name = */ ggml_backend_blas_get_name,
267
  /* .free = */ ggml_backend_blas_free,
268
- /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
269
  /* .set_tensor_async = */ NULL,
270
  /* .get_tensor_async = */ NULL,
271
  /* .cpy_tensor_async = */ NULL,
@@ -275,9 +268,6 @@ static struct ggml_backend_i blas_backend_i = {
275
  /* .graph_plan_update = */ NULL,
276
  /* .graph_plan_compute = */ NULL,
277
  /* .graph_compute = */ ggml_backend_blas_graph_compute,
278
- /* .supports_op = */ NULL,
279
- /* .supports_buft = */ NULL,
280
- /* .offload_op = */ NULL,
281
  /* .event_record = */ NULL,
282
  /* .event_wait = */ NULL,
283
  };
@@ -356,7 +346,7 @@ static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t *
356
  }
357
 
358
  static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
359
- return GGML_BACKEND_DEVICE_TYPE_CPU;
360
 
361
  GGML_UNUSED(dev);
362
  }
@@ -374,7 +364,7 @@ static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct gg
374
  };
375
  }
376
 
377
- static ggml_backend_t ggml_backend_blas_device_init(ggml_backend_dev_t dev, const char * params) {
378
  return ggml_backend_blas_init();
379
 
380
  GGML_UNUSED(dev);
@@ -387,7 +377,7 @@ static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_
387
  GGML_UNUSED(dev);
388
  }
389
 
390
- static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
391
  return ggml_backend_cpu_buffer_from_ptr(ptr, size);
392
 
393
  GGML_UNUSED(dev);
@@ -456,10 +446,10 @@ static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
456
  /* .get_memory = */ ggml_backend_blas_device_get_memory,
457
  /* .get_type = */ ggml_backend_blas_device_get_type,
458
  /* .get_props = */ ggml_backend_blas_device_get_props,
459
- /* .init_backend = */ ggml_backend_blas_device_init,
460
  /* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type,
461
  /* .get_host_buffer_type = */ NULL,
462
- /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_ptr,
463
  /* .supports_op = */ ggml_backend_blas_device_supports_op,
464
  /* .supports_buft = */ ggml_backend_blas_device_supports_buft,
465
  /* .offload_op = */ NULL,
 
224
  delete backend;
225
  }
226
 
 
 
 
 
 
 
227
  static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
228
  ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
229
 
 
259
  static struct ggml_backend_i blas_backend_i = {
260
  /* .get_name = */ ggml_backend_blas_get_name,
261
  /* .free = */ ggml_backend_blas_free,
 
262
  /* .set_tensor_async = */ NULL,
263
  /* .get_tensor_async = */ NULL,
264
  /* .cpy_tensor_async = */ NULL,
 
268
  /* .graph_plan_update = */ NULL,
269
  /* .graph_plan_compute = */ NULL,
270
  /* .graph_compute = */ ggml_backend_blas_graph_compute,
 
 
 
271
  /* .event_record = */ NULL,
272
  /* .event_wait = */ NULL,
273
  };
 
346
  }
347
 
348
  static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
349
+ return GGML_BACKEND_DEVICE_TYPE_ACCEL;
350
 
351
  GGML_UNUSED(dev);
352
  }
 
364
  };
365
  }
366
 
367
+ static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) {
368
  return ggml_backend_blas_init();
369
 
370
  GGML_UNUSED(dev);
 
377
  GGML_UNUSED(dev);
378
  }
379
 
380
+ static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
381
  return ggml_backend_cpu_buffer_from_ptr(ptr, size);
382
 
383
  GGML_UNUSED(dev);
 
446
  /* .get_memory = */ ggml_backend_blas_device_get_memory,
447
  /* .get_type = */ ggml_backend_blas_device_get_type,
448
  /* .get_props = */ ggml_backend_blas_device_get_props,
449
+ /* .init_backend = */ ggml_backend_blas_device_init_backend,
450
  /* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type,
451
  /* .get_host_buffer_type = */ NULL,
452
+ /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr,
453
  /* .supports_op = */ ggml_backend_blas_device_supports_op,
454
  /* .supports_buft = */ ggml_backend_blas_device_supports_buft,
455
  /* .offload_op = */ NULL,
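Editor's note: because the BLAS device now reports GGML_BACKEND_DEVICE_TYPE_ACCEL, ggml_backend_init_best() (shown earlier in this diff) no longer selects it; only GPU and then CPU devices are considered, and ACCEL devices are meant to be added alongside the CPU backend. A minimal sketch:

// Sketch: init_best picks a GPU device if registered, otherwise the CPU device.
#include <stdio.h>

#include "ggml-backend.h"

int main(void) {
    ggml_backend_t backend = ggml_backend_init_best();
    if (backend == NULL) {
        fprintf(stderr, "no usable device registered\n");
        return 1;
    }
    printf("selected backend: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}
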
ggml/src/ggml-cann.cpp CHANGED
@@ -489,23 +489,6 @@ struct ggml_backend_cann_buffer_context {
489
  ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
490
  };
491
 
492
- /**
493
- * @brief Retrieve the name associated with a CANN buffer.
494
- *
495
- * This function returns the name of a CANN buffer, which is stored in the
496
- * context of the buffer.
497
- *
498
- * @param buffer The CANN buffer whose name is to be retrieved.
499
- * @return A pointer to a C-string containing the name of the buffer.
500
- */
501
-
502
- static const char* ggml_backend_cann_buffer_get_name(
503
- ggml_backend_buffer_t buffer) {
504
- return "CANN";
505
-
506
- GGML_UNUSED(buffer);
507
- }
508
-
509
  /**
510
  * @brief Check if a buffer is a CANN buffer.
511
  *
@@ -515,9 +498,10 @@ static const char* ggml_backend_cann_buffer_get_name(
515
  * @param buffer The buffer to check.
516
  * @return true if the buffer is a CANN buffer, false otherwise.
517
  */
 
518
  static bool ggml_backend_buffer_is_cann(
519
  ggml_backend_buffer_t buffer) {
520
- return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
521
  }
522
 
523
  /**
@@ -965,7 +949,6 @@ static void ggml_backend_cann_buffer_clear(
965
  * on a CANN buffer within the backend.
966
  */
967
  static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
968
- /* .get_name = */ ggml_backend_cann_buffer_get_name,
969
  /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
970
  /* .get_base = */ ggml_backend_cann_buffer_get_base,
971
  /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
@@ -999,9 +982,10 @@ struct ggml_backend_cann_buffer_type_context {
999
  */
1000
  static const char* ggml_backend_cann_buffer_type_name(
1001
  ggml_backend_buffer_type_t buft) {
1002
- return "CANN";
 
1003
 
1004
- GGML_UNUSED(buft);
1005
  }
1006
 
1007
  /**
@@ -1465,24 +1449,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
1465
  delete backend;
1466
  }
1467
 
1468
- /**
1469
- * @brief Retrieves the default buffer type associated with the CANN backend.
1470
- *
1471
- * This function returns the buffer type specific to the device associated
1472
- * with the CANN backend. It is used to allocate buffers for computations
1473
- * performed by the backend.
1474
- *
1475
- * @param backend Pointer to the CANN backend structure.
1476
- * @return Pointer to the buffer type structure for the CANN backend.
1477
- */
1478
- static ggml_backend_buffer_type_t
1479
- ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
1480
- ggml_backend_cann_context* cann_ctx =
1481
- (ggml_backend_cann_context*)backend->context;
1482
-
1483
- return ggml_backend_cann_buffer_type(cann_ctx->device);
1484
- }
1485
-
1486
  /**
1487
  * @brief Sets tensor data asynchronously in the CANN backend.
1488
  *
@@ -1863,7 +1829,6 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
1863
  static const ggml_backend_i ggml_backend_cann_interface = {
1864
  /* .get_name = */ ggml_backend_cann_name,
1865
  /* .free = */ ggml_backend_cann_free,
1866
- /* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type,
1867
  /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
1868
  /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
1869
  /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
@@ -1873,9 +1838,6 @@ static const ggml_backend_i ggml_backend_cann_interface = {
1873
  /* .graph_plan_update = */ NULL,
1874
  /* .graph_plan_compute = */ NULL,
1875
  /* .graph_compute = */ ggml_backend_cann_graph_compute,
1876
- /* .supports_op = */ NULL, // moved to device
1877
- /* .supports_buft = */ NULL, // moved to device
1878
- /* .offload_op = */ NULL, // moved to device
1879
  /* .event_record = */ ggml_backend_cann_event_record,
1880
  /* .event_wait = */ ggml_backend_cann_event_wait,
1881
  };
@@ -1918,7 +1880,7 @@ static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t *
1918
 
1919
  static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
1920
  GGML_UNUSED(dev);
1921
- return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
1922
  }
1923
 
1924
  static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
 
489
  ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
490
  };
491
 
492
  /**
493
  * @brief Check if a buffer is a CANN buffer.
494
  *
 
498
  * @param buffer The buffer to check.
499
  * @return true if the buffer is a CANN buffer, false otherwise.
500
  */
501
+ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
502
  static bool ggml_backend_buffer_is_cann(
503
  ggml_backend_buffer_t buffer) {
504
+ return ggml_backend_buft_is_cann(buffer->buft);
505
  }
506
 
507
  /**
 
949
  * on a CANN buffer within the backend.
950
  */
951
  static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
 
952
  /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
953
  /* .get_base = */ ggml_backend_cann_buffer_get_base,
954
  /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
 
982
  */
983
  static const char* ggml_backend_cann_buffer_type_name(
984
  ggml_backend_buffer_type_t buft) {
985
+ ggml_backend_cann_buffer_type_context* buft_ctx =
986
+ (ggml_backend_cann_buffer_type_context*)buft->context;
987
 
988
+ return buft_ctx->name.c_str();
989
  }
990
 
991
  /**
 
1449
  delete backend;
1450
  }
1451
 
1452
  /**
1453
  * @brief Sets tensor data asynchronously in the CANN backend.
1454
  *
 
1829
  static const ggml_backend_i ggml_backend_cann_interface = {
1830
  /* .get_name = */ ggml_backend_cann_name,
1831
  /* .free = */ ggml_backend_cann_free,
 
1832
  /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
1833
  /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
1834
  /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
 
1838
  /* .graph_plan_update = */ NULL,
1839
  /* .graph_plan_compute = */ NULL,
1840
  /* .graph_compute = */ ggml_backend_cann_graph_compute,
 
 
 
1841
  /* .event_record = */ ggml_backend_cann_event_record,
1842
  /* .event_wait = */ ggml_backend_cann_event_wait,
1843
  };
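With .get_name gone from the buffer interface, a backend can no longer recognize its own buffers by comparing the buffer-level get_name pointer; as in the CANN change above, the check now goes through the buffer type attached to the buffer. A minimal sketch of that pattern for a hypothetical backend (the my_backend_* names are placeholders, and the internal ggml-backend-impl.h header is assumed for the struct definitions):

#include "ggml-backend-impl.h"

static const char * my_backend_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    (void) buft;
    return "MyBackend";
}

static bool my_backend_buft_is_mine(ggml_backend_buffer_type_t buft) {
    // a buffer type belongs to this backend if it uses this backend's name function
    return buft->iface.get_name == my_backend_buffer_type_get_name;
}

static bool my_backend_buffer_is_mine(ggml_backend_buffer_t buffer) {
    // buffers are classified through their buffer type rather than a per-buffer name
    return my_backend_buft_is_mine(buffer->buft);
}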
 
1880
 
1881
  static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
1882
  GGML_UNUSED(dev);
1883
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
1884
  }
1885
 
1886
  static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
ggml/src/ggml-cuda.cu CHANGED
@@ -421,18 +421,13 @@ struct ggml_backend_cuda_buffer_context {
421
  }
422
  };
423
 
424
- static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
425
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
426
- return ctx->name.c_str();
427
  }
428
 
429
  static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
430
- return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
431
- }
432
-
433
- static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
434
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
435
- delete ctx;
436
  }
437
 
438
  static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -515,7 +510,6 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
515
  }
516
 
517
  static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
518
- /* .get_name = */ ggml_backend_cuda_buffer_get_name,
519
  /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
520
  /* .get_base = */ ggml_backend_cuda_buffer_get_base,
521
  /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
@@ -548,8 +542,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
548
 
549
  ggml_cuda_set_device(buft_ctx->device);
550
 
551
- size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
552
-
553
  void * dev_ptr;
554
  cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
555
  if (err != cudaSuccess) {
@@ -657,7 +649,9 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
657
  }
658
 
659
  struct ggml_backend_cuda_split_buffer_type_context {
 
660
  std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
 
661
  };
662
 
663
  struct ggml_backend_cuda_split_buffer_context {
@@ -680,16 +674,6 @@ struct ggml_backend_cuda_split_buffer_context {
680
  std::vector<ggml_tensor_extra_gpu *> tensor_extras;
681
  };
682
 
683
- static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
684
- return GGML_CUDA_NAME "_Split";
685
-
686
- GGML_UNUSED(buffer);
687
- }
688
-
689
- static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
690
- return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
691
- GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
692
- }
693
 
694
  static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
695
  ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
@@ -833,7 +817,6 @@ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, u
833
  }
834
 
835
  static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
836
- /* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
837
  /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
838
  /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
839
  /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
@@ -848,9 +831,9 @@ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
848
  // cuda split buffer type
849
 
850
  static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
851
- return GGML_CUDA_NAME "_Split";
852
 
853
- GGML_UNUSED(buft);
854
  }
855
 
856
  static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
@@ -915,11 +898,11 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
915
  /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
916
  };
917
 
918
- ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
919
  static std::mutex mutex;
920
  std::lock_guard<std::mutex> lock(mutex);
921
 
922
- static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
923
 
924
  std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
925
 
@@ -937,18 +920,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten
937
  }
938
  }
939
 
940
- auto it = buft_map.find(tensor_split_arr);
941
  if (it != buft_map.end()) {
942
  return &it->second;
943
  }
944
 
945
  struct ggml_backend_buffer_type buft {
946
  /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
947
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
948
- /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
949
  };
950
 
951
- auto result = buft_map.emplace(tensor_split_arr, buft);
952
  return &result.first->second;
953
  }
954
 
@@ -960,12 +948,6 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
960
  GGML_UNUSED(buft);
961
  }
962
 
963
- static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
964
- return GGML_CUDA_NAME "_Host";
965
-
966
- GGML_UNUSED(buffer);
967
- }
968
-
969
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
970
  CUDA_CHECK(cudaFreeHost(buffer->context));
971
  }
@@ -998,7 +980,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
998
 
999
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
1000
  buffer->buft = buft;
1001
- buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
1002
  buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
1003
 
1004
  return buffer;
@@ -1400,7 +1381,7 @@ static void ggml_cuda_op_mul_mat(
1400
 
1401
  const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
1402
 
1403
- const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
1404
  GGML_ASSERT(!(split && ne02 > 1));
1405
  GGML_ASSERT(!(split && ne03 > 1));
1406
  GGML_ASSERT(!(split && ne02 < ne12));
@@ -1890,7 +1871,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
1890
  }
1891
 
1892
  static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1893
- const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
1894
 
1895
  bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
1896
  && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
@@ -2017,7 +1998,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
2017
 
2018
  GGML_TENSOR_BINARY_OP_LOCALS
2019
 
2020
- GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
2021
 
2022
  cudaStream_t stream = ctx.stream();
2023
 
@@ -2150,7 +2131,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
2150
 
2151
  static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
2152
  // why is this here instead of mul_mat?
2153
- if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
2154
  ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
2155
  }
2156
 
@@ -2371,12 +2352,6 @@ static void ggml_backend_cuda_free(ggml_backend_t backend) {
2371
  delete backend;
2372
  }
2373
 
2374
- static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
2375
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2376
-
2377
- return ggml_backend_cuda_buffer_type(cuda_ctx->device);
2378
- }
2379
-
2380
  static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2381
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2382
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -2582,7 +2557,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
2582
  continue;
2583
  }
2584
 
2585
- if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
2586
  use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
2587
  #ifndef NDEBUG
2588
  GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
@@ -2669,7 +2644,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
2669
  for (int j = 0; j < GGML_MAX_SRC; j++) {
2670
  if (node->src[j] != nullptr) {
2671
  assert(node->src[j]->buffer);
2672
- assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
 
2673
  }
2674
  }
2675
  #endif
@@ -2762,7 +2738,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
2762
  cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
2763
  if (stat == cudaErrorGraphExecUpdateFailure) {
2764
  #ifndef NDEBUG
2765
- GGML_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
2766
  #endif
2767
  // The pre-existing graph exec cannot be updated due to violated constraints
2768
  // so instead clear error and re-instantiate
@@ -2811,7 +2787,6 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
2811
  static const ggml_backend_i ggml_backend_cuda_interface = {
2812
  /* .get_name = */ ggml_backend_cuda_get_name,
2813
  /* .free = */ ggml_backend_cuda_free,
2814
- /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
2815
  /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
2816
  /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
2817
  /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
@@ -2821,9 +2796,6 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
2821
  /* .graph_plan_update = */ NULL,
2822
  /* .graph_plan_compute = */ NULL,
2823
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
2824
- /* .supports_op = */ NULL, // moved to device
2825
- /* .supports_buft = */ NULL, // moved to device
2826
- /* .offload_op = */ NULL, // moved to device
2827
  /* .event_record = */ ggml_backend_cuda_event_record,
2828
  /* .event_wait = */ ggml_backend_cuda_event_wait,
2829
  };
@@ -2913,7 +2885,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
2913
 
2914
  static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
2915
  GGML_UNUSED(dev);
2916
- return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
2917
  }
2918
 
2919
  static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
@@ -2937,7 +2909,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
2937
  };
2938
  }
2939
 
2940
- static ggml_backend_t ggml_backend_cuda_device_init(ggml_backend_dev_t dev, const char * params) {
2941
  GGML_UNUSED(params);
2942
  ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
2943
  return ggml_backend_cuda_init(ctx->device);
@@ -2953,18 +2925,29 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(
2953
  return ggml_backend_cuda_host_buffer_type();
2954
  }
2955
 
2956
- static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
2957
- GGML_UNUSED(dev);
2958
- GGML_UNUSED(ptr);
2959
- GGML_UNUSED(size);
2960
- GGML_UNUSED(max_tensor_size);
2961
- return nullptr;
2962
- }
2963
-
2964
  // TODO: move these functions here
2965
  static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2968
  switch (op->op) {
2969
  case GGML_OP_UNARY:
2970
  switch (ggml_get_unary_op(op)) {
@@ -3190,24 +3173,27 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
3190
  }
3191
 
3192
  static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
3193
- if (ggml_backend_buft_is_cuda_split(buft)) {
3194
- return true;
3195
- }
3196
 
3197
- if (ggml_backend_buft_is_cuda(buft)) {
3198
- ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
3199
- ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
3200
- return buft_ctx->device == dev_ctx->device;
3201
  }
3202
-
3203
- return false;
3204
  }
3205
 
3206
  static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
3207
  const int min_batch_size = 32;
3208
 
3209
- return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
3210
- (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
3211
 
3212
  GGML_UNUSED(dev);
3213
  }
@@ -3248,10 +3234,10 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
3248
  /* .get_memory = */ ggml_backend_cuda_device_get_memory,
3249
  /* .get_type = */ ggml_backend_cuda_device_get_type,
3250
  /* .get_props = */ ggml_backend_cuda_device_get_props,
3251
- /* .init_backend = */ ggml_backend_cuda_device_init,
3252
  /* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
3253
  /* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
3254
- /* .buffer_from_host_ptr = */ ggml_backend_cuda_device_buffer_from_host_ptr,
3255
  /* .supports_op = */ ggml_backend_cuda_device_supports_op,
3256
  /* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
3257
  /* .offload_op = */ ggml_backend_cuda_device_offload_op,
 
421
  }
422
  };
423
 
424
+ static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
425
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
426
+ delete ctx;
427
  }
428
 
429
  static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
430
+ return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
431
  }
432
 
433
  static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
 
510
  }
511
 
512
  static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
 
513
  /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
514
  /* .get_base = */ ggml_backend_cuda_buffer_get_base,
515
  /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
 
542
 
543
  ggml_cuda_set_device(buft_ctx->device);
544
 
 
 
545
  void * dev_ptr;
546
  cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
547
  if (err != cudaSuccess) {
 
649
  }
650
 
651
  struct ggml_backend_cuda_split_buffer_type_context {
652
+ int main_device;
653
  std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
654
+ std::string name;
655
  };
656
 
657
  struct ggml_backend_cuda_split_buffer_context {
 
674
  std::vector<ggml_tensor_extra_gpu *> tensor_extras;
675
  };
676
 
677
 
678
  static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
679
  ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
 
817
  }
818
 
819
  static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
 
820
  /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
821
  /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
822
  /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
 
831
  // cuda split buffer type
832
 
833
  static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
834
+ ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
835
 
836
+ return ctx->name.c_str();
837
  }
838
 
839
  static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
 
898
  /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
899
  };
900
 
901
+ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
902
  static std::mutex mutex;
903
  std::lock_guard<std::mutex> lock(mutex);
904
 
905
+ static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
906
 
907
  std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
908
 
 
920
  }
921
  }
922
 
923
+ auto it = buft_map.find({main_device, tensor_split_arr});
924
  if (it != buft_map.end()) {
925
  return &it->second;
926
  }
927
+ auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
928
+ main_device,
929
+ tensor_split_arr,
930
+ GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
931
+ };
932
 
933
  struct ggml_backend_buffer_type buft {
934
  /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
935
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
936
+ /* .context = */ ctx,
937
  };
938
 
939
+ auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
940
  return &result.first->second;
941
  }
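The split buffer type is now keyed and cached per (main_device, tensor_split) pair instead of per tensor split alone. A hedged usage sketch of the new signature follows; the 0.6/0.4 proportions and make_split_buft are illustrative only, and passing NULL for tensor_split keeps the backend's default split:

#include "ggml-backend.h"
#include "ggml-cuda.h"

// request a buffer type that splits weight matrices across two GPUs,
// with device 0 acting as the main device
static ggml_backend_buffer_type_t make_split_buft(void) {
    const float tensor_split[GGML_CUDA_MAX_DEVICES] = { 0.6f, 0.4f }; // remaining entries stay 0
    return ggml_backend_cuda_split_buffer_type(/*main_device=*/0, tensor_split);
}

Backend-agnostic code would normally reach the same function through the registry's proc-address mechanism rather than linking against ggml-cuda.h directly.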
942
 
 
948
  GGML_UNUSED(buft);
949
  }
950
 
951
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
952
  CUDA_CHECK(cudaFreeHost(buffer->context));
953
  }
 
980
 
981
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
982
  buffer->buft = buft;
 
983
  buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
984
 
985
  return buffer;
 
1381
 
1382
  const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
1383
 
1384
+ const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
1385
  GGML_ASSERT(!(split && ne02 > 1));
1386
  GGML_ASSERT(!(split && ne03 > 1));
1387
  GGML_ASSERT(!(split && ne02 < ne12));
 
1871
  }
1872
 
1873
  static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1874
+ const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
1875
 
1876
  bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
1877
  && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
 
1998
 
1999
  GGML_TENSOR_BINARY_OP_LOCALS
2000
 
2001
+ GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
2002
 
2003
  cudaStream_t stream = ctx.stream();
2004
 
 
2131
 
2132
  static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
2133
  // why is this here instead of mul_mat?
2134
+ if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
2135
  ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
2136
  }
2137
 
 
2352
  delete backend;
2353
  }
2354
 
2355
  static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2356
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2357
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
2557
  continue;
2558
  }
2559
 
2560
+ if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
2561
  use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
2562
  #ifndef NDEBUG
2563
  GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
 
2644
  for (int j = 0; j < GGML_MAX_SRC; j++) {
2645
  if (node->src[j] != nullptr) {
2646
  assert(node->src[j]->buffer);
2647
+ assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
2648
+ ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
2649
  }
2650
  }
2651
  #endif
 
2738
  cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
2739
  if (stat == cudaErrorGraphExecUpdateFailure) {
2740
  #ifndef NDEBUG
2741
+ GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
2742
  #endif
2743
  // The pre-existing graph exec cannot be updated due to violated constraints
2744
  // so instead clear error and re-instantiate
 
2787
  static const ggml_backend_i ggml_backend_cuda_interface = {
2788
  /* .get_name = */ ggml_backend_cuda_get_name,
2789
  /* .free = */ ggml_backend_cuda_free,
 
2790
  /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
2791
  /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
2792
  /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
 
2796
  /* .graph_plan_update = */ NULL,
2797
  /* .graph_plan_compute = */ NULL,
2798
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
 
 
 
2799
  /* .event_record = */ ggml_backend_cuda_event_record,
2800
  /* .event_wait = */ ggml_backend_cuda_event_wait,
2801
  };
 
2885
 
2886
  static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
2887
  GGML_UNUSED(dev);
2888
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
2889
  }
2890
 
2891
  static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
 
2909
  };
2910
  }
2911
 
2912
+ static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
2913
  GGML_UNUSED(params);
2914
  ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
2915
  return ggml_backend_cuda_init(ctx->device);
 
2925
  return ggml_backend_cuda_host_buffer_type();
2926
  }
2927
 
2928
  // TODO: move these functions here
2929
  static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2930
  ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
2931
 
2932
+ // split buffers can only be used with GGML_OP_MUL_MAT
2933
+ if (op->op != GGML_OP_MUL_MAT) {
2934
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
2935
+ if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
2936
+ return false;
2937
+ }
2938
+ }
2939
+ }
2940
+
2941
+ // check if all the sources are allocated on this device
2942
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
2943
+ if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
2944
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
2945
+ if (buft_ctx->device != dev_ctx->device) {
2946
+ return false;
2947
+ }
2948
+ }
2949
+ }
2950
+
2951
  switch (op->op) {
2952
  case GGML_OP_UNARY:
2953
  switch (ggml_get_unary_op(op)) {
 
3173
  }
3174
 
3175
  static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
3176
+ return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
3177
+ }
 
3178
 
3179
+ static int64_t get_op_batch_size(const ggml_tensor * op) {
3180
+ switch (op->op) {
3181
+ case GGML_OP_GET_ROWS:
3182
+ return 0;
3183
+ case GGML_OP_MUL_MAT:
3184
+ return op->ne[1];
3185
+ case GGML_OP_MUL_MAT_ID:
3186
+ case GGML_OP_ROPE:
3187
+ return op->ne[2];
3188
+ default:
3189
+ return ggml_nrows(op);
3190
  }
 
 
3191
  }
3192
 
3193
  static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
3194
  const int min_batch_size = 32;
3195
 
3196
+ return get_op_batch_size(op) >= min_batch_size;
 
3197
 
3198
  GGML_UNUSED(dev);
3199
  }
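The offload decision is now a single batch-size threshold, where the batch size is read from a different dimension depending on the op (ne[1] for MUL_MAT, ne[2] for MUL_MAT_ID and ROPE, 0 for GET_ROWS, the number of rows otherwise). A small sketch of how scheduler-level code might consult this through the public device API; should_offload is illustrative and assumes op is a node taken from an existing graph:

#include "ggml.h"
#include "ggml-backend.h"

// offload a node only if the device supports it and considers it large enough
static bool should_offload(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    return ggml_backend_dev_supports_op(dev, op) &&
           ggml_backend_dev_offload_op(dev, op);
}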
 
3234
  /* .get_memory = */ ggml_backend_cuda_device_get_memory,
3235
  /* .get_type = */ ggml_backend_cuda_device_get_type,
3236
  /* .get_props = */ ggml_backend_cuda_device_get_props,
3237
+ /* .init_backend = */ ggml_backend_cuda_device_init_backend,
3238
  /* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
3239
  /* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
3240
+ /* .buffer_from_host_ptr = */ NULL,
3241
  /* .supports_op = */ ggml_backend_cuda_device_supports_op,
3242
  /* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
3243
  /* .offload_op = */ ggml_backend_cuda_device_offload_op,
ggml/src/ggml-kompute.cpp CHANGED
@@ -1820,11 +1820,6 @@ static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
1820
  }
1821
  }
1822
 
1823
- static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
1824
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
1825
- return ctx->name.c_str();
1826
- }
1827
-
1828
  static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1829
  auto * memory = (ggml_vk_memory *)buffer->context;
1830
  if (ggml_vk_has_device()) {
@@ -1868,7 +1863,6 @@ static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint
1868
  }
1869
 
1870
  static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
1871
- /* .get_name = */ ggml_backend_kompute_buffer_get_name,
1872
  /* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer,
1873
  /* .get_base = */ ggml_backend_kompute_buffer_get_base,
1874
  /* .init_tensor = */ NULL,
@@ -1953,11 +1947,6 @@ static void ggml_backend_kompute_free(ggml_backend_t backend) {
1953
  delete backend;
1954
  }
1955
 
1956
- static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
1957
- auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
1958
- return ggml_backend_kompute_buffer_type(ctx->device);
1959
- }
1960
-
1961
  static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
1962
  auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
1963
  ggml_vk_graph_compute(ctx, cgraph);
@@ -1977,7 +1966,6 @@ static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_back
1977
  static struct ggml_backend_i kompute_backend_i = {
1978
  /* .get_name = */ ggml_backend_kompute_name,
1979
  /* .free = */ ggml_backend_kompute_free,
1980
- /* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type,
1981
  /* .set_tensor_async = */ NULL,
1982
  /* .get_tensor_async = */ NULL,
1983
  /* .cpy_tensor_async = */ NULL,
@@ -1987,9 +1975,6 @@ static struct ggml_backend_i kompute_backend_i = {
1987
  /* .graph_plan_update = */ NULL,
1988
  /* .graph_plan_compute = */ NULL,
1989
  /* .graph_compute = */ ggml_backend_kompute_graph_compute,
1990
- /* .supports_op = */ ggml_backend_kompute_supports_op,
1991
- /* .supports_buft = */ ggml_backend_kompute_supports_buft,
1992
- /* .offload_op = */ NULL,
1993
  /* .event_record = */ NULL,
1994
  /* .event_wait = */ NULL,
1995
  };
 
1820
  }
1821
  }
1822
 
1823
  static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1824
  auto * memory = (ggml_vk_memory *)buffer->context;
1825
  if (ggml_vk_has_device()) {
 
1863
  }
1864
 
1865
  static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
 
1866
  /* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer,
1867
  /* .get_base = */ ggml_backend_kompute_buffer_get_base,
1868
  /* .init_tensor = */ NULL,
 
1947
  delete backend;
1948
  }
1949
 
1950
  static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
1951
  auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
1952
  ggml_vk_graph_compute(ctx, cgraph);
 
1966
  static struct ggml_backend_i kompute_backend_i = {
1967
  /* .get_name = */ ggml_backend_kompute_name,
1968
  /* .free = */ ggml_backend_kompute_free,
 
1969
  /* .set_tensor_async = */ NULL,
1970
  /* .get_tensor_async = */ NULL,
1971
  /* .cpy_tensor_async = */ NULL,
 
1975
  /* .graph_plan_update = */ NULL,
1976
  /* .graph_plan_compute = */ NULL,
1977
  /* .graph_compute = */ ggml_backend_kompute_graph_compute,
 
 
 
1978
  /* .event_record = */ NULL,
1979
  /* .event_wait = */ NULL,
1980
  };
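As with the other backends in this commit, supports_op, supports_buft and offload_op disappear from ggml_backend_i and live only on the device. A sketch of the replacement query path, assuming the backend's device is reachable via ggml_backend_get_device (backend_can_run is illustrative):

#include "ggml.h"
#include "ggml-backend.h"

// capability checks now go through the device that backs the backend instance
static bool backend_can_run(ggml_backend_t backend, const struct ggml_tensor * op,
                            ggml_backend_buffer_type_t buft) {
    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
    return ggml_backend_dev_supports_op(dev, op) &&
           ggml_backend_dev_supports_buft(dev, buft);
}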
ggml/src/ggml-metal.m CHANGED
@@ -3254,12 +3254,6 @@ static enum ggml_status ggml_metal_graph_compute(
3254
 
3255
  // backend interface
3256
 
3257
- static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
3258
- return "Metal";
3259
-
3260
- UNUSED(buffer);
3261
- }
3262
-
3263
  static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
3264
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
3265
 
@@ -3314,7 +3308,6 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
3314
  }
3315
 
3316
  static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
3317
- /* .get_name = */ ggml_backend_metal_buffer_get_name,
3318
  /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
3319
  /* .get_base = */ ggml_backend_metal_buffer_get_base,
3320
  /* .init_tensor = */ NULL,
@@ -3439,6 +3432,29 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
3439
  return &ggml_backend_buffer_type_metal;
3440
  }
3441
 
3442
  // TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr
3443
  ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
3444
  struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
@@ -3515,7 +3531,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
3515
  }
3516
  }
3517
 
3518
- return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
3519
  }
3520
 
3521
  // backend
@@ -3536,12 +3552,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
3536
  free(backend);
3537
  }
3538
 
3539
- static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
3540
- return ggml_backend_metal_buffer_type();
3541
-
3542
- UNUSED(backend);
3543
- }
3544
-
3545
  static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
3546
  return ggml_metal_graph_compute(backend, cgraph);
3547
  }
@@ -3608,7 +3618,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
3608
  static struct ggml_backend_i ggml_backend_metal_i = {
3609
  /* .get_name = */ ggml_backend_metal_name,
3610
  /* .free = */ ggml_backend_metal_free,
3611
- /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
3612
  /* .set_tensor_async = */ NULL,
3613
  /* .get_tensor_async = */ NULL,
3614
  /* .cpy_tensor_async = */ NULL,
@@ -3618,9 +3627,6 @@ static struct ggml_backend_i ggml_backend_metal_i = {
3618
  /* .graph_plan_update = */ NULL,
3619
  /* .graph_plan_compute = */ NULL,
3620
  /* .graph_compute = */ ggml_backend_metal_graph_compute,
3621
- /* .supports_op = */ NULL,
3622
- /* .supports_buft = */ NULL,
3623
- /* .offload_op = */ NULL,
3624
  /* .event_record = */ NULL,
3625
  /* .event_wait = */ NULL,
3626
  };
@@ -3715,7 +3721,7 @@ static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t
3715
  }
3716
 
3717
  static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) {
3718
- return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
3719
 
3720
  GGML_UNUSED(dev);
3721
  }
 
3254
 
3255
  // backend interface
3256
 
3257
  static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
3258
  struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
3259
 
 
3308
  }
3309
 
3310
  static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
 
3311
  /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
3312
  /* .get_base = */ ggml_backend_metal_buffer_get_base,
3313
  /* .init_tensor = */ NULL,
 
3432
  return &ggml_backend_buffer_type_metal;
3433
  }
3434
 
3435
+ static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
3436
+ return "Metal_Mapped";
3437
+
3438
+ UNUSED(buft);
3439
+ }
3440
+
3441
+ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
3442
+ static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = {
3443
+ /* .iface = */ {
3444
+ /* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name,
3445
+ /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
3446
+ /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
3447
+ /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
3448
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
3449
+ /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
3450
+ },
3451
+ /* .device = */ &g_ggml_backend_metal_device,
3452
+ /* .context = */ NULL,
3453
+ };
3454
+
3455
+ return &ggml_backend_buffer_from_ptr_type_metal;
3456
+ }
3457
+
3458
  // TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr
3459
  ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
3460
  struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
 
3531
  }
3532
  }
3533
 
3534
+ return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
3535
  }
3536
 
3537
  // backend
 
3552
  free(backend);
3553
  }
3554
 
3555
  static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
3556
  return ggml_metal_graph_compute(backend, cgraph);
3557
  }
 
3618
  static struct ggml_backend_i ggml_backend_metal_i = {
3619
  /* .get_name = */ ggml_backend_metal_name,
3620
  /* .free = */ ggml_backend_metal_free,
 
3621
  /* .set_tensor_async = */ NULL,
3622
  /* .get_tensor_async = */ NULL,
3623
  /* .cpy_tensor_async = */ NULL,
 
3627
  /* .graph_plan_update = */ NULL,
3628
  /* .graph_plan_compute = */ NULL,
3629
  /* .graph_compute = */ ggml_backend_metal_graph_compute,
 
 
 
3630
  /* .event_record = */ NULL,
3631
  /* .event_wait = */ NULL,
3632
  };
 
3721
  }
3722
 
3723
  static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) {
3724
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
3725
 
3726
  GGML_UNUSED(dev);
3727
  }
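Metal buffers wrapped around existing host pointers now report a dedicated "Metal_Mapped" buffer type instead of the regular Metal buffer type. A hedged sketch of wrapping such a region and inspecting the reported type; Metal requires the pointer to be page-aligned, so page_aligned_ptr stands in for an allocation the caller already has, and wrap_host_memory is illustrative:

#include <stdio.h>
#include "ggml-backend.h"
#include "ggml-metal.h"

// wrap an existing page-aligned host allocation as a Metal buffer and print its buffer type name
static void wrap_host_memory(void * page_aligned_ptr, size_t size, size_t max_tensor_size) {
    ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr(page_aligned_ptr, size, max_tensor_size);
    if (buf != NULL) {
        printf("buffer type: %s\n", ggml_backend_buft_name(ggml_backend_buffer_get_type(buf)));
    }
}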
ggml/src/ggml-rpc.cpp CHANGED
@@ -178,7 +178,6 @@ struct ggml_backend_rpc_buffer_context {
178
  std::shared_ptr<socket_t> sock;
179
  std::unordered_map<ggml_backend_buffer_t, void *> base_cache;
180
  uint64_t remote_ptr;
181
- std::string name;
182
  };
183
 
184
  // RPC helper functions
@@ -409,11 +408,6 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
409
  return sock;
410
  }
411
 
412
- static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
413
- ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
414
- return ctx->name.c_str();
415
- }
416
-
417
  static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
418
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
419
  rpc_msg_free_buffer_req request = {ctx->remote_ptr};
@@ -524,7 +518,6 @@ static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
524
  }
525
 
526
  static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
527
- /* .get_name = */ ggml_backend_rpc_buffer_get_name,
528
  /* .free_buffer = */ ggml_backend_rpc_buffer_free_buffer,
529
  /* .get_base = */ ggml_backend_rpc_buffer_get_base,
530
  /* .init_tensor = */ ggml_backend_rpc_buffer_init_tensor,
@@ -551,7 +544,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
551
  if (response.remote_ptr != 0) {
552
  ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
553
  ggml_backend_rpc_buffer_interface,
554
- new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
555
  response.remote_size);
556
  return buffer;
557
  } else {
@@ -609,11 +602,6 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
609
  delete backend;
610
  }
611
 
612
- static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
613
- ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
614
- return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
615
- }
616
-
617
  static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
618
  UNUSED(backend);
619
  // this is a no-op because we don't have any async operations
@@ -670,7 +658,6 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
670
  static ggml_backend_i ggml_backend_rpc_interface = {
671
  /* .get_name = */ ggml_backend_rpc_name,
672
  /* .free = */ ggml_backend_rpc_free,
673
- /* .get_default_buffer_type = */ ggml_backend_rpc_get_default_buffer_type,
674
  /* .set_tensor_async = */ NULL,
675
  /* .get_tensor_async = */ NULL,
676
  /* .cpy_tensor_async = */ NULL,
@@ -680,9 +667,6 @@ static ggml_backend_i ggml_backend_rpc_interface = {
680
  /* .graph_plan_update = */ NULL,
681
  /* .graph_plan_compute = */ NULL,
682
  /* .graph_compute = */ ggml_backend_rpc_graph_compute,
683
- /* .supports_op = */ NULL,
684
- /* .supports_buft = */ NULL,
685
- /* .offload_op = */ NULL,
686
  /* .event_record = */ NULL,
687
  /* .event_wait = */ NULL,
688
  };
@@ -1278,7 +1262,7 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t *
1278
 
1279
  static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
1280
  // TODO: obtain value from the server
1281
- return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
1282
 
1283
  UNUSED(dev);
1284
  }
 
178
  std::shared_ptr<socket_t> sock;
179
  std::unordered_map<ggml_backend_buffer_t, void *> base_cache;
180
  uint64_t remote_ptr;
 
181
  };
182
 
183
  // RPC helper functions
 
408
  return sock;
409
  }
410
 
411
  static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
412
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
413
  rpc_msg_free_buffer_req request = {ctx->remote_ptr};
 
518
  }
519
 
520
  static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
 
521
  /* .free_buffer = */ ggml_backend_rpc_buffer_free_buffer,
522
  /* .get_base = */ ggml_backend_rpc_buffer_get_base,
523
  /* .init_tensor = */ ggml_backend_rpc_buffer_init_tensor,
 
544
  if (response.remote_ptr != 0) {
545
  ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
546
  ggml_backend_rpc_buffer_interface,
547
+ new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr},
548
  response.remote_size);
549
  return buffer;
550
  } else {
 
602
  delete backend;
603
  }
604
 
605
  static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
606
  UNUSED(backend);
607
  // this is a no-op because we don't have any async operations
 
658
  static ggml_backend_i ggml_backend_rpc_interface = {
659
  /* .get_name = */ ggml_backend_rpc_name,
660
  /* .free = */ ggml_backend_rpc_free,
 
661
  /* .set_tensor_async = */ NULL,
662
  /* .get_tensor_async = */ NULL,
663
  /* .cpy_tensor_async = */ NULL,
 
667
  /* .graph_plan_update = */ NULL,
668
  /* .graph_plan_compute = */ NULL,
669
  /* .graph_compute = */ ggml_backend_rpc_graph_compute,
 
 
 
670
  /* .event_record = */ NULL,
671
  /* .event_wait = */ NULL,
672
  };
 
1262
 
1263
  static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
1264
  // TODO: obtain value from the server
1265
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
1266
 
1267
  UNUSED(dev);
1268
  }
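Since the RPC device now reports itself as a GPU (pending a server-side query), it is picked up by generic device selection just like a local GPU. A short sketch of selecting and initializing the first GPU-type device through the registry (init_first_gpu is illustrative; error handling is minimal):

#include <stdio.h>
#include "ggml-backend.h"

// initialize a backend from the first device that reports itself as a GPU;
// after this change an RPC device registered with the backend registry qualifies as well
static ggml_backend_t init_first_gpu(void) {
    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
    if (dev == NULL) {
        return NULL;
    }
    printf("using device: %s\n", ggml_backend_dev_name(dev));
    return ggml_backend_dev_init(dev, /*params=*/NULL);
}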
ggml/src/ggml-sycl.cpp CHANGED
@@ -249,13 +249,10 @@ struct ggml_backend_sycl_buffer_context {
249
  }
250
  };
251
 
252
- static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
253
- ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
254
- return ctx->name.c_str();
255
- }
256
 
257
  static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
258
- return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
259
  }
260
 
261
  static void
@@ -440,7 +437,6 @@ catch (sycl::exception const &exc) {
440
  }
441
 
442
  static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
443
- /* .get_name = */ ggml_backend_sycl_buffer_get_name,
444
  /* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer,
445
  /* .get_base = */ ggml_backend_sycl_buffer_get_base,
446
  /* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor,
@@ -698,16 +694,6 @@ struct ggml_backend_sycl_split_buffer_context {
698
  std::vector<queue_ptr> streams;
699
  };
700
 
701
- static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
702
- return GGML_SYCL_NAME "_Split";
703
-
704
- GGML_UNUSED(buffer);
705
- }
706
-
707
- static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
708
- return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
709
- }
710
-
711
  static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
712
  ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
713
  delete ctx;
@@ -915,7 +901,6 @@ static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, u
915
  }
916
 
917
  static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
918
- /* .get_name = */ ggml_backend_sycl_split_buffer_get_name,
919
  /* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer,
920
  /* .get_base = */ ggml_backend_sycl_split_buffer_get_base,
921
  /* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor,
@@ -935,6 +920,10 @@ static const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_bu
935
  GGML_UNUSED(buft);
936
  }
937
 
 
 
 
 
938
  static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
939
  // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
940
  // instead, we allocate them for each tensor separately in init_tensor
@@ -1040,12 +1029,6 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
1040
  GGML_UNUSED(buft);
1041
  }
1042
 
1043
- static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
1044
- return GGML_SYCL_NAME "_Host";
1045
-
1046
- GGML_UNUSED(buffer);
1047
- }
1048
-
1049
  static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1050
  ggml_sycl_host_free(buffer->context);
1051
  }
@@ -1061,7 +1044,6 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
1061
  // FIXME: this is a hack to avoid having to implement a new buffer type
1062
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
1063
  buffer->buft = buft;
1064
- buffer->iface.get_name = ggml_backend_sycl_host_buffer_name;
1065
  buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;
1066
 
1067
  return buffer;
@@ -4889,12 +4871,6 @@ static void ggml_backend_sycl_free(ggml_backend_t backend) {
4889
  delete backend;
4890
  }
4891
 
4892
-
4893
- static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
4894
- ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
4895
- return ggml_backend_sycl_buffer_type(sycl_ctx->device);
4896
- }
4897
-
4898
  static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
4899
  ggml_tensor *tensor,
4900
  const void *data, size_t offset,
@@ -5031,7 +5007,6 @@ static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_ev
5031
  static ggml_backend_i ggml_backend_sycl_interface = {
5032
  /* .get_name = */ ggml_backend_sycl_get_name,
5033
  /* .free = */ ggml_backend_sycl_free,
5034
- /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
5035
  /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
5036
  /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
5037
  /* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
@@ -5043,9 +5018,6 @@ static ggml_backend_i ggml_backend_sycl_interface = {
5043
  /* .graph_plan_update = */ NULL,
5044
  /* .graph_plan_compute = */ NULL,
5045
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
5046
- /* .supports_op = */ NULL, // moved to device
5047
- /* .supports_buft = */ NULL, // moved to device
5048
- /* .offload_op = */ NULL, // moved to device
5049
  /* .event_record = */ ggml_backend_sycl_event_record,
5050
  /* .event_wait = */ ggml_backend_sycl_event_wait,
5051
  };
@@ -5092,7 +5064,7 @@ static void ggml_backend_sycl_device_get_memory(ggml_backend_dev_t dev, size_t *
5092
 
5093
  static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) {
5094
  GGML_UNUSED(dev);
5095
- return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
5096
  }
5097
 
5098
  static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
@@ -5388,12 +5360,14 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
5388
  return ctx->devices[index];
5389
  }
5390
 
5391
- static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name)
5392
- {
5393
  GGML_UNUSED(reg);
5394
- if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
5395
- return (void *)ggml_backend_sycl_split_buffer_type;
5396
- }
 
 
 
5397
  // SYCL doesn't support registering host memory, left here for reference
5398
  // "ggml_backend_register_host_buffer"
5399
  // "ggml_backend_unregister_host_buffer"
 
249
  }
250
  };
251
 
252
+ static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft);
 
 
 
253
 
254
  static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
255
+ return buffer->buft->iface.get_name == ggml_backend_sycl_buffer_type_get_name;
256
  }
257
 
258
  static void
 
437
  }
438
 
439
  static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
 
440
  /* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer,
441
  /* .get_base = */ ggml_backend_sycl_buffer_get_base,
442
  /* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor,
 
694
  std::vector<queue_ptr> streams;
695
  };
696
 
 
697
  static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
698
  ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
699
  delete ctx;
 
901
  }
902
 
903
  static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
 
904
  /* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer,
905
  /* .get_base = */ ggml_backend_sycl_split_buffer_get_base,
906
  /* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor,
 
920
  GGML_UNUSED(buft);
921
  }
922
 
923
+ static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
924
+ return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name;
925
+ }
926
+
927
  static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
928
  // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
929
  // instead, we allocate them for each tensor separately in init_tensor
 
1029
  GGML_UNUSED(buft);
1030
  }
1031
 
1032
  static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1033
  ggml_sycl_host_free(buffer->context);
1034
  }
 
1044
  // FIXME: this is a hack to avoid having to implement a new buffer type
1045
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
1046
  buffer->buft = buft;
 
1047
  buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;
1048
 
1049
  return buffer;
 
4871
  delete backend;
4872
  }
4873
 
4874
  static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
4875
  ggml_tensor *tensor,
4876
  const void *data, size_t offset,
 
5007
  static ggml_backend_i ggml_backend_sycl_interface = {
5008
  /* .get_name = */ ggml_backend_sycl_get_name,
5009
  /* .free = */ ggml_backend_sycl_free,
 
5010
  /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
5011
  /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
5012
  /* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
 
5018
  /* .graph_plan_update = */ NULL,
5019
  /* .graph_plan_compute = */ NULL,
5020
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
 
 
 
5021
  /* .event_record = */ ggml_backend_sycl_event_record,
5022
  /* .event_wait = */ ggml_backend_sycl_event_wait,
5023
  };
 
5064
 
5065
  static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) {
5066
  GGML_UNUSED(dev);
5067
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
5068
  }
5069
 
5070
  static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
 
5360
  return ctx->devices[index];
5361
  }
5362
 
5363
+ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
 
5364
  GGML_UNUSED(reg);
5365
+
5366
+ // TODO: update to the current function signature
5367
+ //if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
5368
+ // return (void *)ggml_backend_sycl_split_buffer_type;
5369
+ //}
5370
+
5371
  // SYCL doesn't support registering host memory, left here for reference
5372
  // "ggml_backend_register_host_buffer"
5373
  // "ggml_backend_unregister_host_buffer"
ggml/src/ggml-vulkan.cpp CHANGED
@@ -6247,13 +6247,8 @@ static void ggml_vk_get_device_description(int device, char * description, size_
6247
 
6248
  // device backend
6249
 
6250
- static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
6251
- ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6252
- return ctx->name.c_str();
6253
- }
6254
-
6255
  static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
6256
- return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
6257
  }
6258
 
6259
  static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -6317,7 +6312,6 @@ static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t v
6317
  }
6318
 
6319
  static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
6320
- /* .get_name = */ ggml_backend_vk_buffer_get_name,
6321
  /* .free_buffer = */ ggml_backend_vk_buffer_free_buffer,
6322
  /* .get_base = */ ggml_backend_vk_buffer_get_base,
6323
  /* .init_tensor = */ ggml_backend_vk_buffer_init_tensor,
@@ -6413,7 +6407,6 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_
6413
 
6414
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
6415
  buffer->buft = buft;
6416
- buffer->iface.get_name = ggml_backend_vk_host_buffer_name;
6417
  buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer;
6418
 
6419
  return buffer;
@@ -6646,7 +6639,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
6646
  static ggml_backend_i ggml_backend_vk_interface = {
6647
  /* .get_name = */ ggml_backend_vk_name,
6648
  /* .free = */ ggml_backend_vk_free,
6649
- /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
6650
  /* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
6651
  /* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
6652
  /* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
@@ -6656,9 +6648,6 @@ static ggml_backend_i ggml_backend_vk_interface = {
6656
  /* .graph_plan_update = */ NULL,
6657
  /* .graph_plan_compute = */ NULL,
6658
  /* .graph_compute = */ ggml_backend_vk_graph_compute,
6659
- /* .supports_op = */ NULL,
6660
- /* .supports_buft = */ NULL,
6661
- /* .offload_op = */ NULL,
6662
  /* .event_record = */ NULL,
6663
  /* .event_wait = */ NULL,
6664
  };
@@ -6717,7 +6706,7 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
6717
  //////////////////////////
6718
 
6719
  struct ggml_backend_vk_device_context {
6720
- int device;
6721
  std::string name;
6722
  std::string description;
6723
  };
@@ -6749,7 +6738,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg
6749
 
6750
  static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
6751
  UNUSED(dev);
6752
- return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
6753
  }
6754
 
6755
  static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
@@ -6758,9 +6747,10 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
6758
  props->type = ggml_backend_vk_device_get_type(dev);
6759
  ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
6760
  props->caps = {
6761
- /* async */ false,
6762
- /* host_buffer */ true,
6763
- /* events */ false,
 
6764
  };
6765
  }
6766
 
@@ -6949,7 +6939,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
6949
  static std::mutex mutex;
6950
  std::lock_guard<std::mutex> lock(mutex);
6951
  if (!initialized) {
6952
- for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) {
6953
  ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
6954
  char desc[256];
6955
  ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
 
6247
 
6248
  // device backend
6249
 
 
 
 
 
 
6250
  static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
6251
+ return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name;
6252
  }
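
With .get_name gone from ggml_backend_buffer_i, ownership checks like the one above key off the buffer type rather than the buffer itself. The same idiom for a hypothetical backend (my_buffer_type_get_name is illustrative, not a real symbol):

    #include "ggml-backend-impl.h" // exposes buffer->buft and the buffer-type iface

    static const char * my_buffer_type_get_name(ggml_backend_buffer_type_t buft); // hypothetical

    // a buffer belongs to this backend iff its buffer type uses this backend's name function
    static bool my_backend_buffer_is_mine(ggml_backend_buffer_t buffer) {
        return buffer->buft->iface.get_name == my_buffer_type_get_name;
    }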
6253
 
6254
  static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 
6312
  }
6313
 
6314
  static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
 
6315
  /* .free_buffer = */ ggml_backend_vk_buffer_free_buffer,
6316
  /* .get_base = */ ggml_backend_vk_buffer_get_base,
6317
  /* .init_tensor = */ ggml_backend_vk_buffer_init_tensor,
 
6407
 
6408
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
6409
  buffer->buft = buft;
 
6410
  buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer;
6411
 
6412
  return buffer;
 
6639
  static ggml_backend_i ggml_backend_vk_interface = {
6640
  /* .get_name = */ ggml_backend_vk_name,
6641
  /* .free = */ ggml_backend_vk_free,
 
6642
  /* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
6643
  /* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
6644
  /* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
 
6648
  /* .graph_plan_update = */ NULL,
6649
  /* .graph_plan_compute = */ NULL,
6650
  /* .graph_compute = */ ggml_backend_vk_graph_compute,
 
 
 
6651
  /* .event_record = */ NULL,
6652
  /* .event_wait = */ NULL,
6653
  };
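
supports_op, supports_buft and offload_op no longer appear in the backend vtable above; those queries go through the device. A caller-side sketch (not from this commit):

    #include "ggml-backend.h"

    // ask the device behind a backend whether it can execute a given graph node
    static bool backend_can_run(ggml_backend_t backend, const struct ggml_tensor * node) {
        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
        return ggml_backend_dev_supports_op(dev, node);
    }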
 
6706
  //////////////////////////
6707
 
6708
  struct ggml_backend_vk_device_context {
6709
+ size_t device;
6710
  std::string name;
6711
  std::string description;
6712
  };
 
6738
 
6739
  static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
6740
  UNUSED(dev);
6741
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
6742
  }
6743
 
6744
  static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
 
6747
  props->type = ggml_backend_vk_device_get_type(dev);
6748
  ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
6749
  props->caps = {
6750
+ /* .async = */ false,
6751
+ /* .host_buffer = */ true,
6752
+ /* .buffer_from_host_ptr = */ false,
6753
+ /* .events = */ false,
6754
  };
6755
  }
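
The capability block above gained the buffer_from_host_ptr flag; callers read it through the device properties. A small sketch of dumping what a device reports:

    #include <stdio.h>
    #include "ggml-backend.h"

    // print the memory and capability flags reported by a device
    static void print_dev_props(ggml_backend_dev_t dev) {
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        printf("%s: %zu/%zu MiB free, async=%d host_buffer=%d buffer_from_host_ptr=%d events=%d\n",
               props.name, props.memory_free/1024/1024, props.memory_total/1024/1024,
               props.caps.async, props.caps.host_buffer, props.caps.buffer_from_host_ptr, props.caps.events);
    }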
6756
 
 
6939
  static std::mutex mutex;
6940
  std::lock_guard<std::mutex> lock(mutex);
6941
  if (!initialized) {
6942
+ for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
6943
  ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
6944
  char desc[256];
6945
  ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
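
The lazily initialized device list above is what the registry API iterates over. A sketch of enumerating it from the caller side; the "Vulkan" registry name string is an assumption here.

    #include <stdio.h>
    #include "ggml-backend.h"

    // list the devices exposed by one backend registry, if that backend is compiled in
    static void list_devices(const char * reg_name) {
        ggml_backend_reg_t reg = ggml_backend_reg_by_name(reg_name); // e.g. "Vulkan" (assumed)
        if (reg == NULL) {
            return;
        }
        for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
            ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
            printf("device %zu: %s (%s)\n", i, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
        }
    }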
ggml/src/ggml.c CHANGED
@@ -3999,7 +3999,9 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
3999
  if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4000
  GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4001
  __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
4002
- assert(false);
 
 
4003
  return NULL;
4004
  }
4005
 
 
3999
  if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4000
  GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4001
  __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
4002
+ #ifndef NDEBUG
4003
+ GGML_ABORT("not enough space in the context's memory pool");
4004
+ #endif
4005
  return NULL;
4006
  }
4007
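
The pool check above still logs a warning and returns NULL, but in debug builds it now aborts through GGML_ABORT instead of a bare assert(false). Either way it only triggers when the context was sized too small; a sketch of sizing a context so the check does not trip (the metadata/data split is illustrative):

    #include "ggml.h"

    // reserve room for the tensor metadata plus a rough data budget up front
    static struct ggml_context * make_ctx(size_t n_tensors, size_t data_bytes) {
        struct ggml_init_params params = {
            /* .mem_size   = */ n_tensors*ggml_tensor_overhead() + data_bytes,
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ false,
        };
        return ggml_init(params);
    }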