agray3 committed on
Commit
143f6df
·
1 Parent(s): 0be4f48

Allow multiple copy function pointers for CUDA graph kernel param updates (llama/7565)

Browse files

CUDA graphs require parameter updates to kernels associated with
GGML_OP_CPY nodes. Previously the implementation only checked for a
single CUDA kernel in such nodes, but this caused a bug in cases where
2 or more such kernels exist. This fixes the issue by using a vector to
allow multiple function pointers to be stored and checked against.

Fixes #7942

Files changed (1) hide show
  1. ggml-cuda.cu +7 -6
ggml-cuda.cu CHANGED
@@ -2510,9 +2510,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
2510
 
2511
  bool use_cuda_graph = true;
2512
  bool cuda_graph_update_required = false;
2513
- // pointer to CUDA cpy kernel, which is required to identify
2514
  // kernel parameters which need updated in the graph for each token
2515
- void * ggml_cuda_cpy_fn_ptr = nullptr;
2516
 
2517
  if (cuda_ctx->cuda_graph->graph == nullptr) {
2518
  if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
@@ -2588,9 +2588,10 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
2588
  if (node->op == GGML_OP_CPY) {
2589
  // store the copy op parameter which changes with each token.
2590
  cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
2591
- if (ggml_cuda_cpy_fn_ptr == nullptr) {
2592
- // store a pointer to the copy op CUDA kernel to identify it later
2593
- ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
 
2594
  }
2595
  }
2596
 
@@ -2720,7 +2721,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
2720
  if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
2721
  int k = 0;
2722
  for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
2723
- if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
2724
  char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
2725
  cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
2726
  CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
 
2510
 
2511
  bool use_cuda_graph = true;
2512
  bool cuda_graph_update_required = false;
2513
+ // vector of pointers to CUDA cpy kernels, which are required to identify
2514
  // kernel parameters which need updated in the graph for each token
2515
+ std::vector<void *> ggml_cuda_cpy_fn_ptrs;
2516
 
2517
  if (cuda_ctx->cuda_graph->graph == nullptr) {
2518
  if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
 
2588
  if (node->op == GGML_OP_CPY) {
2589
  // store the copy op parameter which changes with each token.
2590
  cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
2591
+ // store a pointer to each copy op CUDA kernel to identify it later
2592
+ void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
2593
+ if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
2594
+ ggml_cuda_cpy_fn_ptrs.push_back(ptr);
2595
  }
2596
  }
2597
 
 
2721
  if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
2722
  int k = 0;
2723
  for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
2724
+ if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
2725
  char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
2726
  cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
2727
  CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));