Diego Devesa committed · 582a21e · 1 Parent(s): 474cc59
llama : refactor model loader with backend registry (llama/10026)
- ggml/include/ggml-backend.h +12 -7
- ggml/include/ggml-cuda.h +1 -1
- ggml/src/ggml-amx.cpp +8 -25
- ggml/src/ggml-backend-impl.h +4 -15
- ggml/src/ggml-backend.cpp +121 -114
- ggml/src/ggml-blas.cpp +5 -15
- ggml/src/ggml-cann.cpp +6 -44
- ggml/src/ggml-cuda.cu +63 -77
- ggml/src/ggml-kompute.cpp +0 -15
- ggml/src/ggml-metal.m +25 -19
- ggml/src/ggml-rpc.cpp +2 -18
- ggml/src/ggml-sycl.cpp +14 -40
- ggml/src/ggml-vulkan.cpp +8 -18
- ggml/src/ggml.c +3 -1
ggml/include/ggml-backend.h
CHANGED
@@ -114,11 +114,12 @@ extern "C" {
     //
 
     enum ggml_backend_dev_type {
+        // CPU device using system memory
         GGML_BACKEND_DEVICE_TYPE_CPU,
+        // GPU device using dedicated memory
         GGML_BACKEND_DEVICE_TYPE_GPU,
-        // devices
-
-        GGML_BACKEND_DEVICE_TYPE_GPU_FULL
+        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+        GGML_BACKEND_DEVICE_TYPE_ACCEL
     };
 
     // functionality supported by the device

@@ -167,10 +168,14 @@ extern "C" {
     GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
     GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
 
-    //
-    typedef ggml_backend_buffer_type_t
-
+    // Common functions that may be obtained using ggml_backend_reg_get_proc_address
+
+    // Split buffer type for tensor parallelism
+    typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+    // Set the number of threads for the backend
+    typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
+    // Get additional buffer types provided by the device (returns a NULL-terminated array)
+    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
 
     //
     // Backend registry

@@ -192,7 +197,7 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
     // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
     GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
-    // = ggml_backend_dev_init(ggml_backend_dev_by_type(
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
     GGML_API ggml_backend_t ggml_backend_init_best(void);
 
     //
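The typedefs added above are the registry's replacement for backend-specific headers when it comes to optional entry points: a caller requests a symbol by name with ggml_backend_reg_get_proc_address and casts it to the matching typedef, treating NULL as "not supported". A minimal sketch of that pattern, assuming a ggml_backend_reg_t for the backend is already at hand; the helper name is illustrative (the "ggml_backend_set_n_threads" lookup itself is handled by the CPU registry later in this diff):

    #include "ggml-backend.h"

    // Query an optional, registry-provided entry point by name and fall back
    // gracefully when the backend does not implement it.
    static void set_threads_if_supported(ggml_backend_reg_t reg, ggml_backend_t backend, int n_threads) {
        ggml_backend_set_n_threads_t set_n_threads =
            (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
        if (set_n_threads != NULL) {
            set_n_threads(backend, n_threads);
        }
    }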
ggml/include/ggml-cuda.h
CHANGED
@@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
 GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
 
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
 
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
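The only API change in this header is the extra main_device parameter on the split buffer type. A call sketch with purely illustrative values:

    // Illustrative values: split matrix rows across two CUDA devices (60/40),
    // using device 0 as the main device.
    const float tensor_split[2] = {0.6f, 0.4f};
    ggml_backend_buffer_type_t split_buft = ggml_backend_cuda_split_buffer_type(/*main_device=*/0, tensor_split);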
ggml/src/ggml-amx.cpp
CHANGED
@@ -16,12 +16,6 @@
 #if defined(__AMX_INT8__)
 
 // AMX buffer interface
-static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "AMX";
-
-    GGML_UNUSED(buffer);
-}
-
 static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
 }

@@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
-    /* .get_name        = */ ggml_backend_amx_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_amx_buffer_get_base,
     /* .init_tensor     = */ NULL, // no initialization required

@@ -121,14 +114,14 @@ static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft
 ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
         /* .iface = */ {
-
-
-
-
-
-
+            /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
+            /* .is_host          = */ ggml_backend_amx_buffer_type_is_host,
         },
-        /* .device  = */
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
         /* .context = */ NULL,
     };
 

@@ -149,12 +142,6 @@ static void ggml_backend_amx_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_amx_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
 static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
 

@@ -187,7 +174,6 @@ static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, s
 static struct ggml_backend_i ggml_backend_amx_i = {
     /* .get_name                = */ ggml_backend_amx_name,
     /* .free                    = */ ggml_backend_amx_free,
-    /* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_async        = */ NULL,

@@ -197,9 +183,6 @@ static struct ggml_backend_i ggml_backend_amx_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_amx_graph_compute,
-    /* .supports_op             = */ NULL,
-    /* .supports_buft           = */ NULL,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };

@@ -279,7 +262,7 @@ static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t *
 }
 
 static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
-    return
+    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
 
     GGML_UNUSED(dev);
 }
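With the AMX device now reporting GGML_BACKEND_DEVICE_TYPE_ACCEL, callers can discover it through the generic device enumeration instead of a backend-specific header. A minimal sketch, assuming the enumeration helpers declared in ggml-backend.h (ggml_backend_dev_count, ggml_backend_dev_get, ggml_backend_dev_type, ggml_backend_dev_name):

    #include <stdio.h>
    #include "ggml-backend.h"

    // List every registered device intended to be paired with the CPU backend (e.g. AMX, BLAS).
    static void list_accel_devices(void) {
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
                printf("accelerator device: %s\n", ggml_backend_dev_name(dev));
            }
        }
    }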
ggml/src/ggml-backend-impl.h
CHANGED
@@ -22,7 +22,7 @@ extern "C" {
         size_t          (*get_max_size)  (ggml_backend_buffer_type_t buft);
         // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
         size_t          (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
-        // (optional) check if tensor data is in host memory (defaults to false)
+        // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
         bool            (*is_host)       (ggml_backend_buffer_type_t buft);
     };
 

@@ -37,7 +37,6 @@ extern "C" {
     //
 
     struct ggml_backend_buffer_i {
-        const char * (*get_name)    (ggml_backend_buffer_t buffer);
         // (optional) free the buffer
         void         (*free_buffer) (ggml_backend_buffer_t buffer);
         // base address of the buffer

@@ -88,19 +87,16 @@ extern "C" {
 
         void (*free)(ggml_backend_t backend);
 
-        // Will be moved to the device interface
-        // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
-
         // (optional) asynchronous tensor data access
         void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
         bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
-        // (optional) complete all pending operations
+        // (optional) complete all pending operations (required if the backend supports async operations)
         void (*synchronize)(ggml_backend_t backend);
 
-        // (optional)
+        // (optional) graph plans (not used currently)
+        // compute graph with a plan
         ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology

@@ -111,13 +107,6 @@ extern "C" {
         // compute graph (always async if supported by the backend)
         enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
-        // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
-        // new backends should implement the device interface instead
-        // These functions are being moved to the device interface
-        bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
-        bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
-        bool (*offload_op)  (ggml_backend_t backend, const struct ggml_tensor * op);
-
         // (optional) event synchronization
         // record an event on this stream
         void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
ggml/src/ggml-backend.cpp
CHANGED
@@ -34,6 +34,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }
 
 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if (size == 0) {
+        // return a dummy buffer for zero-sized allocations
+        return ggml_backend_buffer_init(buft, {}, NULL, 0);
+    }
+
     return buft->iface.alloc_buffer(buft, size);
 }
 

@@ -89,7 +94,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
 }
 
 const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
-    return
+    return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
 }
 
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {

@@ -108,6 +113,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
 }
 
 void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // get_base is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);
 
     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");

@@ -122,6 +132,15 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
     }
 }
 
+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    // clear is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return;
+    }
+
+    buffer->iface.clear(buffer, value);
+}
+
 size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
 }

@@ -134,10 +153,6 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }
 
-void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    buffer->iface.clear(buffer, value);
-}
-
 bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
 }

@@ -198,7 +213,7 @@ void ggml_backend_free(ggml_backend_t backend) {
 }
 
 ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
-    return backend->
+    return ggml_backend_dev_buffer_type(backend->device);
 }
 
 ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {

@@ -238,43 +253,42 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
 GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
-    if (!size) {
+    if (size == 0) {
         return;
     }
 
-    GGML_ASSERT(buf
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
 
     buf->iface.memset_tensor(buf, tensor, value, offset, size);
 }

@@ -316,32 +330,15 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-
-    if (backend->device) {
-        return ggml_backend_dev_supports_op(backend->device, op);
-    }
-
-    return backend->iface.supports_op(backend, op);
+    return ggml_backend_dev_supports_op(backend->device, op);
 }
 
 bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-
-    if (backend->device) {
-        return ggml_backend_dev_supports_buft(backend->device, buft);
-    }
-    return backend->iface.supports_buft(backend, buft);
+    return ggml_backend_dev_supports_buft(backend->device, buft);
 }
 
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-
-    if (backend->device) {
-        return ggml_backend_dev_offload_op(backend->device, op);
-    }
-
-    if (backend->iface.offload_op != NULL) {
-        return backend->iface.offload_op(backend, op);
-    }
-    return false;
+    return ggml_backend_dev_offload_op(backend->device, op);
 }
 
 ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {

@@ -582,6 +579,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_VULKAN
         register_backend(ggml_backend_vk_reg());
 #endif
+#ifdef GGML_USE_CANN
+        register_backend(ggml_backend_cann_reg());
+#endif
 #ifdef GGML_USE_BLAS
         register_backend(ggml_backend_blas_reg());
 #endif

@@ -591,9 +591,6 @@
 #ifdef GGML_USE_AMX
         register_backend(ggml_backend_amx_reg());
 #endif
-#ifdef GGML_USE_CANN
-        register_backend(ggml_backend_cann_reg());
-#endif
 
         // TODO: kompute
 

@@ -701,9 +698,9 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
 }
 
 ggml_backend_t ggml_backend_init_best(void) {
-    ggml_backend_dev_t dev = ggml_backend_dev_by_type(
+    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
     if (!dev) {
-        dev = ggml_backend_dev_by_type(
+        dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     }
     if (!dev) {
         return NULL;

@@ -711,13 +708,7 @@ ggml_backend_t ggml_backend_init_best(void) {
     return ggml_backend_dev_init(dev, NULL);
 }
 
-// backend
-
-static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "CPU";
-
-    GGML_UNUSED(buffer);
-}
+// CPU backend - buffer
 
 static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
     uintptr_t data = (uintptr_t)buffer->context;

@@ -767,7 +758,6 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
-    /* .get_name        = */ ggml_backend_cpu_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
     /* .init_tensor     = */ NULL, // no initialization required

@@ -780,7 +770,6 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
 };
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
-    /* .get_name        = */ ggml_backend_cpu_buffer_get_name,
     /* .free_buffer     = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
     /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
     /* .init_tensor     = */ NULL, // no initialization required

@@ -792,6 +781,8 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
     /* .reset           = */ NULL,
 };
 
+// CPU backend - buffer type
+
 static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "CPU";
 

@@ -799,19 +790,14 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-
-    if (alloc_size == 0) {
-        alloc_size = 1;
-    }
-
-    void * data = ggml_aligned_malloc(alloc_size);
+    void * data = ggml_aligned_malloc(size);
 
     if (data == NULL) {
-        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__,
+        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
     }
 
-    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data,
+    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
 }
 
 static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {

@@ -843,6 +829,29 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
     return &ggml_backend_cpu_buffer_type;
 }
 
+static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_Mapped";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+        /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context  = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type;
+}
+
 #ifdef GGML_USE_CPU_HBM
 
 // buffer type HBM

@@ -855,18 +864,11 @@ static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffe
     GGML_UNUSED(buft);
 }
 
-static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
-    return "CPU_HBM";
-
-    GGML_UNUSED(buf);
-}
-
 static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     hbw_free(buffer->context);
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    //void * ptr = hbw_malloc(size);
     void * ptr;
     int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
     if (result != 0) {

@@ -876,7 +878,6 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
 
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
     buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
 
     return buffer;

@@ -899,6 +900,21 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 }
 #endif
 
+static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
+    static ggml_backend_buffer_type_t bufts[] = {
+#ifdef GGML_USE_CPU_HBM
+        ggml_backend_cpu_hbm_buffer_type(),
+#endif
+        NULL
+    };
+
+    return bufts;
+
+    GGML_UNUSED(device);
+}
+
+// CPU backend - backend (stream)
+
 struct ggml_backend_cpu_context {
     int                 n_threads;
     ggml_threadpool_t   threadpool;

@@ -923,12 +939,6 @@ static void ggml_backend_cpu_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_cpu_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
 struct ggml_backend_plan_cpu {
     struct ggml_cplan cplan;
     struct ggml_cgraph cgraph;

@@ -998,7 +1008,6 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
 static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .get_name                = */ ggml_backend_cpu_get_name,
     /* .free                    = */ ggml_backend_cpu_free,
-    /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_async        = */ NULL,

@@ -1008,9 +1017,6 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
-    /* .supports_op             = */ NULL,
-    /* .supports_buft           = */ NULL,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };

@@ -1081,10 +1087,10 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
 
 ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
-    return ggml_backend_buffer_init(
+    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
 }
 
-
+// CPU backend - device
 
 struct ggml_backend_cpu_device_context {
     std::string description = "CPU";

@@ -1171,7 +1177,7 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t *
 }
 
 static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
-    return
+    return GGML_BACKEND_DEVICE_TYPE_CPU;
 
     GGML_UNUSED(dev);
 }

@@ -1189,7 +1195,7 @@ static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggm
     };
 }
 
-static ggml_backend_t
+static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
     return ggml_backend_cpu_init();
 
     GGML_UNUSED(dev);

@@ -1202,7 +1208,7 @@ static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_b
     GGML_UNUSED(dev);
 }
 
-static ggml_backend_buffer_t
+static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
     return ggml_backend_cpu_buffer_from_ptr(ptr, size);
 
     GGML_UNUSED(dev);

@@ -1244,10 +1250,10 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
     /* .get_memory           = */ ggml_backend_cpu_device_get_memory,
     /* .get_type             = */ ggml_backend_cpu_device_get_type,
     /* .get_props            = */ ggml_backend_cpu_device_get_props,
-    /* .init_backend         = */
+    /* .init_backend         = */ ggml_backend_cpu_device_init_backend,
     /* .get_buffer_type      = */ ggml_backend_cpu_device_get_buffer_type,
     /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */
+    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
     /* .supports_op          = */ ggml_backend_cpu_device_supports_op,
     /* .supports_buft        = */ ggml_backend_cpu_device_supports_buft,
     /* .offload_op           = */ NULL,

@@ -1256,7 +1262,7 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
     /* .event_synchronize    = */ NULL,
 };
 
-
+// CPU backend - backend (reg)
 
 static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
     return "CPU";

@@ -1287,6 +1293,10 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
     if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
         return (void *)ggml_backend_cpu_set_n_threads;
     }
+    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
+        return (void *)ggml_backend_cpu_get_extra_bufts;
+    }
+
     return NULL;
 
     GGML_UNUSED(reg);

@@ -1315,12 +1325,6 @@ struct ggml_backend_multi_buffer_context {
     size_t n_buffers;
 };
 
-static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
-
-    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
-}
-
 static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {

@@ -1339,7 +1343,6 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_
 }
 
 static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
-    /* .get_name        = */ ggml_backend_multi_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_multi_buffer_free_buffer,
     /* .get_base        = */ NULL,
     /* .init_tensor     = */ NULL,

@@ -1368,7 +1371,7 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
 }
 
 bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
-    return buffer->iface.
+    return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
 }
 
 void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {

@@ -1460,7 +1463,7 @@ struct ggml_backend_sched {
     char * context_buffer;
     size_t context_buffer_size;
 
-
+    int debug;
 };
 
 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)

@@ -1500,7 +1503,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
     return -1;
 }
 
-#if
+#if 1
 #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)

@@ -1548,7 +1551,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         if (src == NULL) {
             continue;
         }
-
+        // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
+        // not an ideal solution
+        if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
             if (src_backend_id == sched->n_backends - 1) {

@@ -1595,19 +1600,21 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         if (ggml_is_view_op(node->op)) {
            continue;
         }
-
-
-
-
-
-
-
-
-            GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
-                fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
-            GGML_LOG_DEBUG("\n");
+        if (sched->debug > 1) {
+            ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
+            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+                fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    continue;
+                }
+                ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
+                GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+                    fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
             }
+            GGML_LOG_DEBUG("\n");
         }
     }
 }

@@ -1899,11 +1906,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (src == NULL) {
                 continue;
             }
-            // check if a weight is on a different backend
+            // check if a weight is on a different and incompatible backend
             // by starting a new split, the memory of the previously offloaded weights can be reused
             if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                 int src_backend_id = tensor_backend_id(src);
-                if (src_backend_id != cur_backend_id) {
+                if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                     need_new_split = true;
                     break;
                 }

@@ -1915,7 +1922,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 int src_backend_id = sched->hv_tensor_backend_ids[id];
                 bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                 if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
-                    //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                     need_new_split = true;
                     break;
                 }

@@ -2240,7 +2246,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
 
-
+    const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
+    sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
     sched->n_backends = n_backends;
     sched->n_copies   = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
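The scheduler changes above also make the debug output opt-in at runtime: ggml_backend_sched_new() reads the GGML_SCHED_DEBUG environment variable once, and the per-node assignment dump in the ggml_backend_sched_print_assignments hunk requires a level greater than 1. A small, purely illustrative snippet for setting it in-process before the scheduler is created (setting the variable in the shell works just as well):

    #include <stdlib.h>

    // Illustrative only: raise the scheduler debug level before ggml_backend_sched_new() runs;
    // the value is read once at scheduler creation.
    static void enable_sched_debug(void) {
        setenv("GGML_SCHED_DEBUG", "2", 1);
    }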
ggml/src/ggml-blas.cpp
CHANGED
@@ -224,12 +224,6 @@ static void ggml_backend_blas_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_cpu_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
 static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
@@ -265,7 +259,6 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
 static struct ggml_backend_i blas_backend_i = {
     /* .get_name                = */ ggml_backend_blas_get_name,
     /* .free                    = */ ggml_backend_blas_free,
-    /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_async        = */ NULL,
@@ -275,9 +268,6 @@ static struct ggml_backend_i blas_backend_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_blas_graph_compute,
-    /* .supports_op             = */ NULL,
-    /* .supports_buft           = */ NULL,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };
@@ -356,7 +346,7 @@ static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t *
 }
 
 static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
-    return
+    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
 
     GGML_UNUSED(dev);
 }
@@ -374,7 +364,7 @@ static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct gg
     };
 }
 
-static ggml_backend_t
+static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) {
     return ggml_backend_blas_init();
 
     GGML_UNUSED(dev);
@@ -387,7 +377,7 @@ static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_
     GGML_UNUSED(dev);
 }
 
-static ggml_backend_buffer_t
+static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
     return ggml_backend_cpu_buffer_from_ptr(ptr, size);
 
     GGML_UNUSED(dev);
@@ -456,10 +446,10 @@ static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
     /* .get_memory           = */ ggml_backend_blas_device_get_memory,
     /* .get_type             = */ ggml_backend_blas_device_get_type,
     /* .get_props            = */ ggml_backend_blas_device_get_props,
-    /* .init_backend         = */
+    /* .init_backend         = */ ggml_backend_blas_device_init_backend,
     /* .get_buffer_type      = */ ggml_backend_blas_device_get_buffer_type,
     /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */
+    /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr,
     /* .supports_op          = */ ggml_backend_blas_device_supports_op,
     /* .supports_buft        = */ ggml_backend_blas_device_supports_buft,
     /* .offload_op           = */ NULL,
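With the BLAS device now reported as GGML_BACKEND_DEVICE_TYPE_ACCEL, a loader can discover accelerator devices through the registry and pair them with the CPU backend. A hedged sketch of that enumeration, assuming the public registry API declared in ggml-backend.h (ggml_backend_dev_count/get/type/name/init):

    #include <cstdio>
    #include "ggml-backend.h"

    int main() {
        // Walk the backend registry and initialize every accelerator device
        // (e.g. BLAS or AMX) that is meant to be used together with the CPU backend.
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
                printf("found accelerator device: %s\n", ggml_backend_dev_name(dev));
                ggml_backend_t backend = ggml_backend_dev_init(dev, /*params =*/ NULL);
                if (backend) {
                    ggml_backend_free(backend);
                }
            }
        }
        return 0;
    }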
ggml/src/ggml-cann.cpp
CHANGED
@@ -489,23 +489,6 @@ struct ggml_backend_cann_buffer_context {
     ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
 };
 
-/**
- * @brief Retrieve the name associated with a CANN buffer.
- *
- * This function returns the name of a CANN buffer, which is stored in the
- * context of the buffer.
- *
- * @param buffer The CANN buffer whose name is to be retrieved.
- * @return A pointer to a C-string containing the name of the buffer.
- */
-
-static const char* ggml_backend_cann_buffer_get_name(
-    ggml_backend_buffer_t buffer) {
-    return "CANN";
-
-    GGML_UNUSED(buffer);
-}
-
 /**
  * @brief Check if a buffer is a CANN buffer.
  *
@@ -515,9 +498,10 @@ static const char* ggml_backend_cann_buffer_get_name(
  * @param buffer The buffer to check.
  * @return true if the buffer is a CANN buffer, false otherwise.
  */
+static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
 static bool ggml_backend_buffer_is_cann(
     ggml_backend_buffer_t buffer) {
-    return buffer->
+    return ggml_backend_buft_is_cann(buffer->buft);
 }
 
 /**
@@ -965,7 +949,6 @@ static void ggml_backend_cann_buffer_clear(
  * on a CANN buffer within the backend.
  */
 static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
-    /* .get_name        = */ ggml_backend_cann_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_cann_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cann_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_cann_buffer_init_tensor,
@@ -999,9 +982,10 @@ struct ggml_backend_cann_buffer_type_context {
  */
 static const char* ggml_backend_cann_buffer_type_name(
     ggml_backend_buffer_type_t buft) {
+    ggml_backend_cann_buffer_type_context* buft_ctx =
+        (ggml_backend_cann_buffer_type_context*)buft->context;
 
+    return buft_ctx->name.c_str();
 }
 
 /**
@@ -1465,24 +1449,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
     delete backend;
 }
 
-/**
- * @brief Retrieves the default buffer type associated with the CANN backend.
- *
- * This function returns the buffer type specific to the device associated
- * with the CANN backend. It is used to allocate buffers for computations
- * performed by the backend.
- *
- * @param backend Pointer to the CANN backend structure.
- * @return Pointer to the buffer type structure for the CANN backend.
- */
-static ggml_backend_buffer_type_t
-ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_cann_context* cann_ctx =
-        (ggml_backend_cann_context*)backend->context;
-
-    return ggml_backend_cann_buffer_type(cann_ctx->device);
-}
-
 /**
  * @brief Sets tensor data asynchronously in the CANN backend.
  *
@@ -1863,7 +1829,6 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
 static const ggml_backend_i ggml_backend_cann_interface = {
     /* .get_name                = */ ggml_backend_cann_name,
     /* .free                    = */ ggml_backend_cann_free,
-    /* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type,
     /* .set_tensor_async        = */ ggml_backend_cann_set_tensor_async,
     /* .get_tensor_async        = */ ggml_backend_cann_get_tensor_async,
     /* .cpy_tensor_async        = */ ggml_backend_cann_cpy_tensor_async,
@@ -1873,9 +1838,6 @@ static const ggml_backend_i ggml_backend_cann_interface = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_cann_graph_compute,
-    /* .supports_op             = */ NULL, // moved to device
-    /* .supports_buft           = */ NULL, // moved to device
-    /* .offload_op              = */ NULL, // moved to device
     /* .event_record            = */ ggml_backend_cann_event_record,
     /* .event_wait              = */ ggml_backend_cann_event_wait,
 };
@@ -1918,7 +1880,7 @@ static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t *
 
 static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
     GGML_UNUSED(dev);
-    return
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
 }
 
 static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
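The CANN change shows the pattern used throughout this commit: with .get_name removed from ggml_backend_buffer_i, a backend identifies its buffers through the buffer type instead of a per-buffer name. A sketch of that pattern for a hypothetical backend; the my_backend_* names are placeholders, and the sketch relies on the internal definitions in ggml-backend-impl.h:

    #include "ggml-backend-impl.h"

    // The buffer type's get_name callback doubles as the identity of the backend's
    // buffers: ownership is checked against the buffer *type*, not the buffer name.
    static const char * my_backend_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
        (void) buft;
        return "MyBackend";
    }

    static bool my_backend_buft_is_mine(ggml_backend_buffer_type_t buft) {
        return buft->iface.get_name == my_backend_buffer_type_get_name;
    }

    static bool my_backend_buffer_is_mine(ggml_backend_buffer_t buffer) {
        return my_backend_buft_is_mine(buffer->buft);
    }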
ggml/src/ggml-cuda.cu
CHANGED
@@ -421,18 +421,13 @@ struct ggml_backend_cuda_buffer_context {
     }
 };
 
-static
+static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+    delete ctx;
 }
 
 static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
-    return buffer->iface.
-}
-
-static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-    delete ctx;
+    return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
 }
 
 static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -515,7 +510,6 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
-    /* .get_name        = */ ggml_backend_cuda_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
@@ -548,8 +542,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
 
     ggml_cuda_set_device(buft_ctx->device);
 
-    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
-
     void * dev_ptr;
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
@@ -657,7 +649,9 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
 }
 
 struct ggml_backend_cuda_split_buffer_type_context {
+    int main_device;
     std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
+    std::string name;
 };
 
 struct ggml_backend_cuda_split_buffer_context {
@@ -680,16 +674,6 @@ struct ggml_backend_cuda_split_buffer_context {
     std::vector<ggml_tensor_extra_gpu *> tensor_extras;
 };
 
-static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return GGML_CUDA_NAME "_Split";
-
-    GGML_UNUSED(buffer);
-}
-
-static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
-    GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
-}
 
 static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
@@ -833,7 +817,6 @@ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, u
 }
 
 static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
-    /* .get_name        = */ ggml_backend_cuda_split_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_cuda_split_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cuda_split_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_cuda_split_buffer_init_tensor,
@@ -848,9 +831,9 @@ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
 // cuda split buffer type
 
 static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
 
+    return ctx->name.c_str();
 }
 
 static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
@@ -915,11 +898,11 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
     /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
 };
 
-ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
+ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
 
-    static std::map<std::array<float, GGML_CUDA_MAX_DEVICES
+    static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
 
     std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
@@ -937,18 +920,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten
         }
     }
 
-    auto it = buft_map.find(tensor_split_arr);
+    auto it = buft_map.find({main_device, tensor_split_arr});
     if (it != buft_map.end()) {
         return &it->second;
     }
+    auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
+        main_device,
+        tensor_split_arr,
+        GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
+    };
 
     struct ggml_backend_buffer_type buft {
         /* .iface   = */ ggml_backend_cuda_split_buffer_type_interface,
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(),
-        /* .context = */
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
+        /* .context = */ ctx,
     };
 
-    auto result = buft_map.emplace(tensor_split_arr, buft);
+    auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
     return &result.first->second;
 }
@@ -960,12 +948,6 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
     GGML_UNUSED(buft);
 }
 
-static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return GGML_CUDA_NAME "_Host";
-
-    GGML_UNUSED(buffer);
-}
-
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     CUDA_CHECK(cudaFreeHost(buffer->context));
 }
@@ -998,7 +980,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
 
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
     buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 
     return buffer;
@@ -1400,7 +1381,7 @@ static void ggml_cuda_op_mul_mat(
 
     const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
 
-    const bool split =
+    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
     GGML_ASSERT(!(split && ne02 > 1));
     GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));
@@ -1890,7 +1871,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 }
 
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const bool split =
+    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
 
     bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
@@ -2017,7 +1998,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
-    GGML_ASSERT(!
+    GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
 
     cudaStream_t stream = ctx.stream();
@@ -2150,7 +2131,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
 static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
     // why is this here instead of mul_mat?
-    if (dst->src[0] != nullptr &&
+    if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
         ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
     }
@@ -2371,12 +2352,6 @@ static void ggml_backend_cuda_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
-}
-
 static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -2582,7 +2557,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
             continue;
         }
 
-        if (node->src[0] && node->src[0]->buffer &&
+        if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
             use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
@@ -2669,7 +2644,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
                 assert(node->src[j]->buffer);
-                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
+                       ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
             }
         }
 #endif
@@ -2762,7 +2738,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate
@@ -2811,7 +2787,6 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
 static const ggml_backend_i ggml_backend_cuda_interface = {
     /* .get_name                = */ ggml_backend_cuda_get_name,
     /* .free                    = */ ggml_backend_cuda_free,
-    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
     /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
     /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
     /* .cpy_tensor_async        = */ ggml_backend_cuda_cpy_tensor_async,
@@ -2821,9 +2796,6 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
-    /* .supports_op             = */ NULL, // moved to device
-    /* .supports_buft           = */ NULL, // moved to device
-    /* .offload_op              = */ NULL, // moved to device
    /* .event_record            = */ ggml_backend_cuda_event_record,
    /* .event_wait              = */ ggml_backend_cuda_event_wait,
 };
@@ -2913,7 +2885,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
 
 static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
     GGML_UNUSED(dev);
-    return
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
 }
 
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
@@ -2937,7 +2909,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
     };
 }
 
-static ggml_backend_t
+static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
     GGML_UNUSED(params);
     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
     return ggml_backend_cuda_init(ctx->device);
@@ -2953,18 +2925,29 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(
     return ggml_backend_cuda_host_buffer_type();
 }
 
-static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
-    GGML_UNUSED(dev);
-    GGML_UNUSED(ptr);
-    GGML_UNUSED(size);
-    GGML_UNUSED(max_tensor_size);
-    return nullptr;
-}
-
 // TODO: move these functions here
 static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
 
+    // split buffers can only be used with GGML_OP_MUL_MAT
+    if (op->op != GGML_OP_MUL_MAT) {
+        for (int i = 0; i < GGML_MAX_SRC; i++) {
+            if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
+                return false;
+            }
+        }
+    }
+
+    // check if all the sources are allocated on this device
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
+            ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
+            if (buft_ctx->device != dev_ctx->device) {
+                return false;
+            }
+        }
+    }
+
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -3190,24 +3173,27 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 }
 
 static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
+}
 
+static int64_t get_op_batch_size(const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_GET_ROWS:
+            return 0;
+        case GGML_OP_MUL_MAT:
+            return op->ne[1];
+        case GGML_OP_MUL_MAT_ID:
+        case GGML_OP_ROPE:
+            return op->ne[2];
+        default:
+            return ggml_nrows(op);
     }
-
-    return false;
 }
 
 static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     const int min_batch_size = 32;
 
-    return (op
-        (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
+    return get_op_batch_size(op) >= min_batch_size;
 
     GGML_UNUSED(dev);
 }
@@ -3248,10 +3234,10 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
     /* .get_memory           = */ ggml_backend_cuda_device_get_memory,
     /* .get_type             = */ ggml_backend_cuda_device_get_type,
     /* .get_props            = */ ggml_backend_cuda_device_get_props,
-    /* .init_backend         = */
+    /* .init_backend         = */ ggml_backend_cuda_device_init_backend,
     /* .get_buffer_type      = */ ggml_backend_cuda_device_get_buffer_type,
     /* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
-    /* .buffer_from_host_ptr = */
+    /* .buffer_from_host_ptr = */ NULL,
     /* .supports_op          = */ ggml_backend_cuda_device_supports_op,
     /* .supports_buft        = */ ggml_backend_cuda_device_supports_buft,
    /* .offload_op           = */ ggml_backend_cuda_device_offload_op,
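After this change the CUDA split buffer type is keyed by (main_device, tensor_split). A sketch of how a caller might request it directly on a CUDA build, assuming ggml-cuda.h now declares the two-argument form shown in the diff; the split values are illustrative:

    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    int main() {
        // Tensor-parallel layout: device 0 is the "main" device and gets 60% of each
        // row-split tensor, device 1 gets the remaining 40% (illustrative values).
        float tensor_split[GGML_CUDA_MAX_DEVICES] = { 0.6f, 0.4f };
        ggml_backend_buffer_type_t split_buft = ggml_backend_cuda_split_buffer_type(0, tensor_split);

        // split_buft can now be passed wherever a ggml_backend_buffer_type_t is expected,
        // e.g. when allocating the weight tensors of a model.
        (void) split_buft;
        return 0;
    }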
ggml/src/ggml-kompute.cpp
CHANGED
@@ -1820,11 +1820,6 @@ static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
     }
 }
 
-static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
-    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
-    return ctx->name.c_str();
-}
-
 static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     auto * memory = (ggml_vk_memory *)buffer->context;
     if (ggml_vk_has_device()) {
@@ -1868,7 +1863,6 @@ static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint
 }
 
 static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
-    /* .get_name        = */ ggml_backend_kompute_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_kompute_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_kompute_buffer_get_base,
     /* .init_tensor     = */ NULL,
@@ -1953,11 +1947,6 @@ static void ggml_backend_kompute_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
-    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
-    return ggml_backend_kompute_buffer_type(ctx->device);
-}
-
 static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
     ggml_vk_graph_compute(ctx, cgraph);
@@ -1977,7 +1966,6 @@ static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_back
 static struct ggml_backend_i kompute_backend_i = {
     /* .get_name                = */ ggml_backend_kompute_name,
     /* .free                    = */ ggml_backend_kompute_free,
-    /* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_async        = */ NULL,
@@ -1987,9 +1975,6 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_kompute_graph_compute,
-    /* .supports_op             = */ ggml_backend_kompute_supports_op,
-    /* .supports_buft           = */ ggml_backend_kompute_supports_buft,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };
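With supports_op, supports_buft, and offload_op dropped from ggml_backend_i here as well, capability queries go through the device object instead of an initialized backend. A hedged helper sketch; device_can_run is our name, not a ggml API:

    #include "ggml.h"
    #include "ggml-backend.h"

    // Ask the device whether it can run an op -- the query that replaces the
    // removed ggml_backend_i::supports_op / supports_buft hooks.
    static bool device_can_run(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
        if (!ggml_backend_dev_supports_op(dev, op)) {
            return false;
        }
        // if the tensor is already allocated, the device must also accept its buffer type
        return op->buffer == NULL ||
               ggml_backend_dev_supports_buft(dev, ggml_backend_buffer_get_type(op->buffer));
    }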
ggml/src/ggml-metal.m
CHANGED
@@ -3254,12 +3254,6 @@ static enum ggml_status ggml_metal_graph_compute(
 
 // backend interface
 
-static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "Metal";
-
-    UNUSED(buffer);
-}
-
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
@@ -3314,7 +3308,6 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
 }
 
 static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
-    /* .get_name        = */ ggml_backend_metal_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_metal_buffer_get_base,
     /* .init_tensor     = */ NULL,
@@ -3439,6 +3432,29 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     return &ggml_backend_buffer_type_metal;
 }
 
+static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "Metal_Mapped";
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = {
+        /* .iface = */ {
+            /* .get_name         = */ ggml_backend_metal_buffer_from_ptr_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
+            /* .get_max_size     = */ ggml_backend_metal_buffer_type_get_max_size,
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
+        },
+        /* .device  = */ &g_ggml_backend_metal_device,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_buffer_from_ptr_type_metal;
+}
+
 // TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr
 ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
     struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
@@ -3515,7 +3531,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         }
     }
 
-    return ggml_backend_buffer_init(
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }
 
 // backend
@@ -3536,12 +3552,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
     free(backend);
 }
 
-static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_metal_buffer_type();
-
-    UNUSED(backend);
-}
-
 static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return ggml_metal_graph_compute(backend, cgraph);
 }
@@ -3608,7 +3618,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
 static struct ggml_backend_i ggml_backend_metal_i = {
     /* .get_name                = */ ggml_backend_metal_name,
     /* .free                    = */ ggml_backend_metal_free,
-    /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_async        = */ NULL,
@@ -3618,9 +3627,6 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_metal_graph_compute,
-    /* .supports_op             = */ NULL,
-    /* .supports_buft           = */ NULL,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };
@@ -3715,7 +3721,7 @@ static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t
 }
 
 static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) {
-    return
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
 
     GGML_UNUSED(dev);
 }
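The wrapped-pointer path now has its own "Metal_Mapped" buffer type, while ggml_backend_metal_buffer_from_ptr keeps the signature shown above. A usage sketch with illustrative sizes; the alignment and max_size choices here are assumptions, not taken from the commit:

    #include <cstdlib>
    #include "ggml-backend.h"
    #include "ggml-metal.h"

    int main() {
        // Wrap an existing host allocation in a Metal buffer without copying it.
        // Metal requires page-aligned memory for no-copy buffers; 16 KiB alignment
        // covers Apple Silicon page sizes (assumption for this sketch).
        const size_t size = 16u * 1024 * 1024;
        void * data = aligned_alloc(16384, size);

        ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr(data, size, /*max_size =*/ size);
        // ... create tensors that view this memory, run graphs, etc. ...
        ggml_backend_buffer_free(buf);

        free(data); // the buffer does not own the wrapped pointer
        return 0;
    }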
ggml/src/ggml-rpc.cpp
CHANGED
@@ -178,7 +178,6 @@ struct ggml_backend_rpc_buffer_context {
     std::shared_ptr<socket_t> sock;
     std::unordered_map<ggml_backend_buffer_t, void *> base_cache;
     uint64_t remote_ptr;
-    std::string name;
 };
 
 // RPC helper functions
@@ -409,11 +408,6 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
     return sock;
 }
 
-static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
-    return ctx->name.c_str();
-}
-
 static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     rpc_msg_free_buffer_req request = {ctx->remote_ptr};
@@ -524,7 +518,6 @@ static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
-    /* .get_name        = */ ggml_backend_rpc_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_rpc_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_rpc_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_rpc_buffer_init_tensor,
@@ -551,7 +544,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
     if (response.remote_ptr != 0) {
         ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
             ggml_backend_rpc_buffer_interface,
-            new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr
+            new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr},
             response.remote_size);
         return buffer;
     } else {
@@ -609,11 +602,6 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
-    return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
-}
-
 static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
     UNUSED(backend);
     // this is no-op because we don't have any async operations
@@ -670,7 +658,6 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
 static ggml_backend_i ggml_backend_rpc_interface = {
     /* .get_name                = */ ggml_backend_rpc_name,
     /* .free                    = */ ggml_backend_rpc_free,
-    /* .get_default_buffer_type = */ ggml_backend_rpc_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_async        = */ NULL,
@@ -680,9 +667,6 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_rpc_graph_compute,
-    /* .supports_op             = */ NULL,
-    /* .supports_buft           = */ NULL,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };
@@ -1278,7 +1262,7 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t *
 
 static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
     // TODO: obtain value from the server
-    return
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
 
     UNUSED(dev);
 }
ggml/src/ggml-sycl.cpp
CHANGED

@@ -249,13 +249,10 @@ struct ggml_backend_sycl_buffer_context {
     }
 };

-static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
-    return ctx->name.c_str();
-}
+static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft);

 static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
+    return buffer->buft->iface.get_name == ggml_backend_sycl_buffer_type_get_name;
 }

 static void
@@ -440,7 +437,6 @@ catch (sycl::exception const &exc) {
 }

 static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
-    /* .get_name        = */ ggml_backend_sycl_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_sycl_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_sycl_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_sycl_buffer_init_tensor,
@@ -698,16 +694,6 @@ struct ggml_backend_sycl_split_buffer_context {
     std::vector<queue_ptr> streams;
 };

-static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return GGML_SYCL_NAME "_Split";
-
-    GGML_UNUSED(buffer);
-}
-
-static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
-}
-
 static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
     delete ctx;
@@ -915,7 +901,6 @@ static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, u
 }

 static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
-    /* .get_name        = */ ggml_backend_sycl_split_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_sycl_split_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_sycl_split_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_sycl_split_buffer_init_tensor,
@@ -935,6 +920,10 @@ static const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_bu
     GGML_UNUSED(buft);
 }

+static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
+    return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name;
+}
+
 static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
     // instead, we allocate them for each tensor separately in init_tensor
@@ -1040,12 +1029,6 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
     GGML_UNUSED(buft);
 }

-static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return GGML_SYCL_NAME "_Host";
-
-    GGML_UNUSED(buffer);
-}
-
 static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_sycl_host_free(buffer->context);
 }
@@ -1061,7 +1044,6 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
     // FIXME: this is a hack to avoid having to implement a new buffer type
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_sycl_host_buffer_name;
     buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;

     return buffer;
@@ -4889,12 +4871,6 @@ static void ggml_backend_sycl_free(ggml_backend_t backend) {
     delete backend;
 }

-
-static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-    return ggml_backend_sycl_buffer_type(sycl_ctx->device);
-}
-
 static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
                                                ggml_tensor *tensor,
                                                const void *data, size_t offset,
@@ -5031,7 +5007,6 @@ static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_ev
 static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_name                = */ ggml_backend_sycl_get_name,
     /* .free                    = */ ggml_backend_sycl_free,
-    /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
     /* .set_tensor_async        = */ ggml_backend_sycl_set_tensor_async,
     /* .get_tensor_async        = */ ggml_backend_sycl_get_tensor_async,
     /* .cpy_tensor_async        = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
@@ -5043,9 +5018,6 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
-    /* .supports_op             = */ NULL, // moved to device
-    /* .supports_buft           = */ NULL, // moved to device
-    /* .offload_op              = */ NULL, // moved to device
     /* .event_record            = */ ggml_backend_sycl_event_record,
     /* .event_wait              = */ ggml_backend_sycl_event_wait,
 };
@@ -5092,7 +5064,7 @@ static void ggml_backend_sycl_device_get_memory(ggml_backend_dev_t dev, size_t *

 static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) {
     GGML_UNUSED(dev);
-    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
 }

 static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
@@ -5388,12 +5360,14 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
     return ctx->devices[index];
 }

-static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name)
-{
+static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
     GGML_UNUSED(reg);
-    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
-        return (void *)ggml_backend_sycl_split_buffer_type;
-    }
+
+    // TODO: update to the current function signature
+    //if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+    //    return (void *)ggml_backend_sycl_split_buffer_type;
+    //}
+
     // SYCL doesn't support registering host memory, left here for reference
     // "ggml_backend_register_host_buffer"
     // "ggml_backend_unregister_host_buffer"
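The SYCL hunks above follow the pattern applied across backends in this commit: a buffer is identified by comparing its buffer type's get_name function pointer instead of carrying a per-buffer get_name. From user code the equivalent check goes through the public accessors. A small sketch, assuming only ggml_backend_buffer_get_type() and ggml_backend_buft_name(); the "SYCL" prefix and the example type names in the comment are illustrative assumptions, not an API guarantee.

    #include <cstring>
    #include "ggml-backend.h"

    // hypothetical helper: true if a buffer's type name starts with the given prefix
    static bool buffer_type_has_prefix(ggml_backend_buffer_t buffer, const char * prefix) {
        ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buffer);
        const char * name = ggml_backend_buft_name(buft); // e.g. "SYCL0", "SYCL_Split"
        return strncmp(name, prefix, strlen(prefix)) == 0;
    }

Inside the backends themselves the function-pointer comparison is preferred: it is an exact identity check on the buffer type and avoids any string comparison.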
ggml/src/ggml-vulkan.cpp
CHANGED

@@ -6247,13 +6247,8 @@ static void ggml_vk_get_device_description(int device, char * description, size_

 // device backend

-static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-    return ctx->name.c_str();
-}
-
 static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
+    return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name;
 }

 static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -6317,7 +6312,6 @@ static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t v
 }

 static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
-    /* .get_name        = */ ggml_backend_vk_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_vk_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_vk_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_vk_buffer_init_tensor,
@@ -6413,7 +6407,6 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_

     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_vk_host_buffer_name;
     buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer;

     return buffer;
@@ -6646,7 +6639,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name                = */ ggml_backend_vk_name,
     /* .free                    = */ ggml_backend_vk_free,
-    /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,  // ggml_backend_vk_set_tensor_async,
     /* .get_tensor_async        = */ NULL,  // ggml_backend_vk_get_tensor_async,
     /* .cpy_tensor_async        = */ NULL,  // ggml_backend_vk_cpy_tensor_async,
@@ -6656,9 +6648,6 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_vk_graph_compute,
-    /* .supports_op             = */ NULL,
-    /* .supports_buft           = */ NULL,
-    /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
 };
@@ -6717,7 +6706,7 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
 //////////////////////////

 struct ggml_backend_vk_device_context {
+    size_t device;
     std::string name;
     std::string description;
 };
@@ -6749,7 +6738,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg

 static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
     UNUSED(dev);
-    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
 }

 static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
@@ -6758,9 +6747,10 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
     props->type = ggml_backend_vk_device_get_type(dev);
     ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = {
-        /* async       */ false,
-        /* host_buffer */ true,
-        /* events      */ false,
+        /* .async                 = */ false,
+        /* .host_buffer           = */ true,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ false,
     };
 }
@@ -6949,7 +6939,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
     if (!initialized) {
+        for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
             ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
             char desc[256];
             ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
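The Vulkan device now fills in all four capability flags of ggml_backend_dev_caps, including the new buffer_from_host_ptr field. A short sketch of how a caller can read them back through ggml_backend_dev_get_props(); the printf formatting is only for illustration.

    #include <cstdio>
    #include "ggml-backend.h"

    static void print_device_caps(ggml_backend_dev_t dev) {
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        printf("%s: async=%d host_buffer=%d buffer_from_host_ptr=%d events=%d\n",
            props.name,
            props.caps.async,
            props.caps.host_buffer,
            props.caps.buffer_from_host_ptr,
            props.caps.events);
    }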
ggml/src/ggml.c
CHANGED

@@ -3999,7 +3999,9 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
     if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
         GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
                 __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
-        assert(false);
+#ifndef NDEBUG
+        GGML_ABORT("not enough space in the context's memory pool");
+#endif
         return NULL;
     }
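The ggml.c change keeps the release-build behaviour (warn and return NULL) but turns pool exhaustion into a hard GGML_ABORT in debug builds. The usual way to stay clear of it is to size metadata-only contexts explicitly from the object overheads; a minimal sketch of that sizing pattern, with the tensor count as an assumed placeholder.

    #include "ggml.h"

    // create a context that only holds tensor/graph metadata; tensor data lives elsewhere
    static struct ggml_context * make_meta_ctx(size_t n_tensors) {
        struct ggml_init_params params = {
            /* .mem_size   = */ n_tensors * ggml_tensor_overhead() + ggml_graph_overhead(),
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ true,
        };
        return ggml_init(params);
    }

Free the context with ggml_free() once the graph has been built and its data allocated (for example in a backend buffer).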