rgerganov slaren committed on
Commit
eab8082
·
1 Parent(s): 7e5d850

llama : offload to RPC in addition to other backends (llama/7640)

Browse files

* llama : offload to RPC in addition to other backends

* - fix copy_tensor being called on the src buffer instead of the dst buffer

- always initialize views in the view_src buffer

- add RPC backend to Makefile build

- add endpoint to all RPC object names

* add rpc-server to Makefile

* Update llama.cpp

Co-authored-by: slaren <[email protected]>

---------

Co-authored-by: slaren <[email protected]>

Files changed (4)
  1. ggml-alloc.c +3 -3
  2. ggml-backend.c +5 -5
  3. ggml-backend.h +1 -1
  4. ggml-rpc.cpp +2 -2
ggml-alloc.c CHANGED
@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
750
  // this tensor was allocated without ggml-backend
751
  return;
752
  }
753
- ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
754
  }
755
  } else {
756
  if (tensor->data == NULL) {
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
899
  if (t->view_src == NULL) {
900
  ggml_tallocr_alloc(&tallocr, t);
901
  } else if (t->buffer == NULL) {
902
- ggml_backend_view_init(buffer, t);
903
  }
904
  } else {
905
  if (t->view_src != NULL && t->buffer == NULL) {
906
  // view of a pre-allocated tensor
907
- ggml_backend_view_init(buffer, t);
908
  }
909
  }
910
  }
 
750
  // this tensor was allocated without ggml-backend
751
  return;
752
  }
753
+ ggml_backend_view_init(tensor);
754
  }
755
  } else {
756
  if (tensor->data == NULL) {
 
899
  if (t->view_src == NULL) {
900
  ggml_tallocr_alloc(&tallocr, t);
901
  } else if (t->buffer == NULL) {
902
+ ggml_backend_view_init(t);
903
  }
904
  } else {
905
  if (t->view_src != NULL && t->buffer == NULL) {
906
  // view of a pre-allocated tensor
907
+ ggml_backend_view_init(t);
908
  }
909
  }
910
  }
ggml-backend.c CHANGED
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
151
  bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
152
  ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
153
  if (dst_buf->iface.cpy_tensor) {
154
- return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
155
  }
156
  return false;
157
  }
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
1887
 
1888
  // utils
1889
 
1890
- void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
1891
  GGML_ASSERT(tensor->buffer == NULL);
1892
  GGML_ASSERT(tensor->view_src != NULL);
1893
  GGML_ASSERT(tensor->view_src->buffer != NULL);
1894
  GGML_ASSERT(tensor->view_src->data != NULL);
1895
 
1896
- tensor->buffer = buffer;
1897
  tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
1898
- ggml_backend_buffer_init_tensor(buffer, tensor);
1899
  }
1900
 
1901
  void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
1954
  struct ggml_tensor * dst = node_copies[id];
1955
  if (dst->view_src != NULL) {
1956
  graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
1957
- ggml_backend_view_init(dst->view_src->buffer, dst);
1958
  }
1959
  else {
1960
  ggml_backend_tensor_copy(src, dst);
 
151
  bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
152
  ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
153
  if (dst_buf->iface.cpy_tensor) {
154
+ return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
155
  }
156
  return false;
157
  }
 
1887
 
1888
  // utils
1889
 
1890
+ void ggml_backend_view_init(struct ggml_tensor * tensor) {
1891
  GGML_ASSERT(tensor->buffer == NULL);
1892
  GGML_ASSERT(tensor->view_src != NULL);
1893
  GGML_ASSERT(tensor->view_src->buffer != NULL);
1894
  GGML_ASSERT(tensor->view_src->data != NULL);
1895
 
1896
+ tensor->buffer = tensor->view_src->buffer;
1897
  tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
1898
+ ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
1899
  }
1900
 
1901
  void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
 
1954
  struct ggml_tensor * dst = node_copies[id];
1955
  if (dst->view_src != NULL) {
1956
  graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
1957
+ ggml_backend_view_init(dst);
1958
  }
1959
  else {
1960
  ggml_backend_tensor_copy(src, dst);
ggml-backend.h CHANGED
@@ -225,7 +225,7 @@ extern "C" {
225
 
226
  // Tensor initialization
227
  GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
228
- GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
229
 
230
 
231
  #ifdef __cplusplus
 
225
 
226
  // Tensor initialization
227
  GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
228
+ GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
229
 
230
 
231
  #ifdef __cplusplus
ggml-rpc.cpp CHANGED
@@ -491,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
491
  if (remote_ptr != 0) {
492
  ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
493
  ggml_backend_rpc_buffer_interface,
494
- new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
495
  remote_size);
496
  return buffer;
497
  } else {
@@ -692,7 +692,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
692
  GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
693
  ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
694
  /* .endpoint = */ endpoint,
695
- /* .name = */ "RPC",
696
  };
697
 
698
  ggml_backend_t backend = new ggml_backend {
 
491
  if (remote_ptr != 0) {
492
  ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
493
  ggml_backend_rpc_buffer_interface,
494
+ new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
495
  remote_size);
496
  return buffer;
497
  } else {
 
692
  GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
693
  ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
694
  /* .endpoint = */ endpoint,
695
+ /* .name = */ "RPC[" + std::string(endpoint) + "]",
696
  };
697
 
698
  ggml_backend_t backend = new ggml_backend {