agray3 committed
Commit 143f6df
1 Parent(s): 0be4f48
Allow multiple copy function pointers for CUDA graph kernel param updates (llama/7565)
CUDA graphs require parameter updates to kernels associated with
GGML_OP_CPY nodes. Previously the implementation only checked for a
single CUDA kernel in such nodes, but this caused a bug in cases where
2 such kernels exist. This fixes the issue by using a vector to allow
multiple function pointers to be stored and checked against.
Fixes #7942
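The change boils down to a dedup-and-match pattern on raw kernel function pointers: collect each distinct copy-kernel pointer once, then test membership when deciding which captured graph nodes need their parameters patched. The following is only a rough standalone sketch of that pattern, not part of the patch; the kernel names and the two passes below are invented stand-ins for ggml_cuda_cpy_fn and the params[i].func check in the diff.

// Hypothetical sketch: store each distinct function pointer once in a vector,
// then use a membership test to recognize copy-kernel nodes later.
#include <algorithm>
#include <cstdio>
#include <vector>

// Stand-ins for two different copy kernels (hypothetical names).
static void cpy_kernel_a() {}
static void cpy_kernel_b() {}

int main() {
    std::vector<void *> cpy_fn_ptrs;

    // "Capture" pass: mirrors the std::find + push_back logic, so each distinct
    // kernel pointer is stored exactly once even if several nodes use it.
    void * seen[] = { (void *) cpy_kernel_a, (void *) cpy_kernel_b, (void *) cpy_kernel_a };
    for (void * ptr : seen) {
        if (std::find(cpy_fn_ptrs.begin(), cpy_fn_ptrs.end(), ptr) == cpy_fn_ptrs.end()) {
            cpy_fn_ptrs.push_back(ptr);
        }
    }

    // "Update" pass: mirrors the count(...) > 0 membership test used to decide
    // whether a graph node's kernel parameters must be updated.
    void * node_fn = (void *) cpy_kernel_b;
    if (std::count(cpy_fn_ptrs.begin(), cpy_fn_ptrs.end(), node_fn) > 0) {
        std::printf("node uses a copy kernel; %zu distinct copy kernels stored\n", cpy_fn_ptrs.size());
    }
    return 0;
}

Since only a handful of distinct copy kernels can appear in one graph, a linear search over a small vector is presumably cheap enough, which is likely why the patch uses a plain std::vector rather than a set.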
- ggml-cuda.cu +7 -6
ggml-cuda.cu
CHANGED
@@ -2510,9 +2510,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 
     bool use_cuda_graph = true;
     bool cuda_graph_update_required = false;
-    // pointer to CUDA cpy kernel, which is required to identify
+    // vector of pointers to CUDA cpy kernels, which are required to identify
     // kernel parameters which need updated in the graph for each token
-    void * ggml_cuda_cpy_fn_ptr = nullptr;
+    std::vector<void *> ggml_cuda_cpy_fn_ptrs;
 
     if (cuda_ctx->cuda_graph->graph == nullptr) {
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
@@ -2588,9 +2588,10 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (node->op == GGML_OP_CPY) {
             // store the copy op parameter which changes with each token.
             cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
-            if (ggml_cuda_cpy_fn_ptr == nullptr) {
-                // store a pointer to the copy op CUDA kernel to identify it later
-                ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+            // store a pointer to each copy op CUDA kernel to identify it later
+            void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+            if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+                ggml_cuda_cpy_fn_ptrs.push_back(ptr);
             }
         }
 
@@ -2720,7 +2721,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
         int k = 0;
         for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
-            if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
+            if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
                 char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
                 cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
                 CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));