Spaces:
Running
Running
llama : offload to RPC in addition to other backends (llama/7640)
Browse files
* llama : offload to RPC in addition to other backends
* - fix copy_tensor being called on the src buffer instead of the dst buffer
- always initialize views in the view_src buffer
- add RPC backend to Makefile build
- add endpoint to all RPC object names
* add rpc-server to Makefile
* Update llama.cpp
Co-authored-by: slaren <[email protected]>
---------
Co-authored-by: slaren <[email protected]>
- ggml-alloc.c +3 -3
- ggml-backend.c +5 -5
- ggml-backend.h +1 -1
- ggml-rpc.cpp +2 -2
ggml-alloc.c
CHANGED
|
@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
|
| 750 |
// this tensor was allocated without ggml-backend
|
| 751 |
return;
|
| 752 |
}
|
| 753 |
-
ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
|
| 754 |
}
|
| 755 |
} else {
|
| 756 |
if (tensor->data == NULL) {
|
|
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
|
| 899 |
if (t->view_src == NULL) {
|
| 900 |
ggml_tallocr_alloc(&tallocr, t);
|
| 901 |
} else if (t->buffer == NULL) {
|
| 902 |
-
ggml_backend_view_init(buf, t);
|
| 903 |
}
|
| 904 |
} else {
|
| 905 |
if (t->view_src != NULL && t->buffer == NULL) {
|
| 906 |
// view of a pre-allocated tensor
|
| 907 |
-
ggml_backend_view_init(buf, t);
|
| 908 |
}
|
| 909 |
}
|
| 910 |
}
|
|
|
|
| 750 |
// this tensor was allocated without ggml-backend
|
| 751 |
return;
|
| 752 |
}
|
| 753 |
+
ggml_backend_view_init(tensor);
|
| 754 |
}
|
| 755 |
} else {
|
| 756 |
if (tensor->data == NULL) {
|
|
|
|
| 899 |
if (t->view_src == NULL) {
|
| 900 |
ggml_tallocr_alloc(&tallocr, t);
|
| 901 |
} else if (t->buffer == NULL) {
|
| 902 |
+
ggml_backend_view_init(t);
|
| 903 |
}
|
| 904 |
} else {
|
| 905 |
if (t->view_src != NULL && t->buffer == NULL) {
|
| 906 |
// view of a pre-allocated tensor
|
| 907 |
+
ggml_backend_view_init(t);
|
| 908 |
}
|
| 909 |
}
|
| 910 |
}
|
ggml-backend.c
CHANGED
|
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
|
| 151 |
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 152 |
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
| 153 |
if (dst_buf->iface.cpy_tensor) {
|
| 154 |
-
return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
|
| 155 |
}
|
| 156 |
return false;
|
| 157 |
}
|
|
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
|
|
| 1887 |
|
| 1888 |
// utils
|
| 1889 |
|
| 1890 |
-
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
| 1891 |
GGML_ASSERT(tensor->buffer == NULL);
|
| 1892 |
GGML_ASSERT(tensor->view_src != NULL);
|
| 1893 |
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
| 1894 |
GGML_ASSERT(tensor->view_src->data != NULL);
|
| 1895 |
|
| 1896 |
-
tensor->buffer = buffer;
|
| 1897 |
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
| 1898 |
-
ggml_backend_buffer_init_tensor(buffer, tensor);
|
| 1899 |
}
|
| 1900 |
|
| 1901 |
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
|
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
|
|
| 1954 |
struct ggml_tensor * dst = node_copies[id];
|
| 1955 |
if (dst->view_src != NULL) {
|
| 1956 |
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
| 1957 |
-
ggml_backend_view_init(dst->view_src->buffer, dst);
|
| 1958 |
}
|
| 1959 |
else {
|
| 1960 |
ggml_backend_tensor_copy(src, dst);
|
|
|
|
| 151 |
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 152 |
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
| 153 |
if (dst_buf->iface.cpy_tensor) {
|
| 154 |
+
return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
|
| 155 |
}
|
| 156 |
return false;
|
| 157 |
}
|
|
|
|
| 1887 |
|
| 1888 |
// utils
|
| 1889 |
|
| 1890 |
+
void ggml_backend_view_init(struct ggml_tensor * tensor) {
|
| 1891 |
GGML_ASSERT(tensor->buffer == NULL);
|
| 1892 |
GGML_ASSERT(tensor->view_src != NULL);
|
| 1893 |
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
| 1894 |
GGML_ASSERT(tensor->view_src->data != NULL);
|
| 1895 |
|
| 1896 |
+
tensor->buffer = tensor->view_src->buffer;
|
| 1897 |
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
| 1898 |
+
ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
|
| 1899 |
}
|
| 1900 |
|
| 1901 |
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
|
|
|
| 1954 |
struct ggml_tensor * dst = node_copies[id];
|
| 1955 |
if (dst->view_src != NULL) {
|
| 1956 |
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
| 1957 |
+
ggml_backend_view_init(dst);
|
| 1958 |
}
|
| 1959 |
else {
|
| 1960 |
ggml_backend_tensor_copy(src, dst);
|
ggml-backend.h
CHANGED
|
@@ -225,7 +225,7 @@ extern "C" {
|
|
| 225 |
|
| 226 |
// Tensor initialization
|
| 227 |
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
| 228 |
-
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
| 229 |
|
| 230 |
|
| 231 |
#ifdef __cplusplus
|
|
|
|
| 225 |
|
| 226 |
// Tensor initialization
|
| 227 |
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
| 228 |
+
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
| 229 |
|
| 230 |
|
| 231 |
#ifdef __cplusplus
|
ggml-rpc.cpp
CHANGED
|
@@ -491,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
|
| 491 |
if (remote_ptr != 0) {
|
| 492 |
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
| 493 |
ggml_backend_rpc_buffer_interface,
|
| 494 |
-
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
|
| 495 |
remote_size);
|
| 496 |
return buffer;
|
| 497 |
} else {
|
|
@@ -692,7 +692,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
|
|
| 692 |
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
| 693 |
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
| 694 |
/* .endpoint = */ endpoint,
|
| 695 |
-
/* .name = */ "RPC",
|
| 696 |
};
|
| 697 |
|
| 698 |
ggml_backend_t backend = new ggml_backend {
|
|
|
|
| 491 |
if (remote_ptr != 0) {
|
| 492 |
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
| 493 |
ggml_backend_rpc_buffer_interface,
|
| 494 |
+
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
|
| 495 |
remote_size);
|
| 496 |
return buffer;
|
| 497 |
} else {
|
|
|
|
| 692 |
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
| 693 |
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
| 694 |
/* .endpoint = */ endpoint,
|
| 695 |
+
/* .name = */ "RPC[" + std::string(endpoint) + "]",
|
| 696 |
};
|
| 697 |
|
| 698 |
ggml_backend_t backend = new ggml_backend {
|