Oliver Simons committed
Commit: bb523fb · Parent: 536128f
cuda : Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs (llama/14741)

* Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs

  Gemma3n uses a matrix-matrix addition as part of its input processing, wrongly triggering CUDA_GRAPH disablement on NVGPUs even when a batch size of 1 is used.

* Exclude `project_per_layer_input` by matching node names

  This ensures that all other graphs which don't exhibit this pattern do not have their behavior changed.

* Revert unnecessary formatting changes
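For context, the sketch below restates the heuristic this commit changes. It is an illustrative simplification, not ggml's actual code: `fake_node`, `add_disables_graph_old`, and `add_disables_graph_new` are hypothetical names introduced here, and only `GGML_OP_ADD`, the `ne[1] > 1` test, and the two node names come from the diff that follows.

    #include <cstdint>
    #include <string>

    // Hypothetical stand-in for ggml_tensor, reduced to the two fields the
    // heuristic reads. The caller is assumed to have already checked that the
    // node is a GGML_OP_ADD and that src1 is non-null, as the real code does.
    struct fake_node {
        std::string name; // node name assigned by the model's graph builder
        int64_t     ne1;  // ne[1], the dimension the heuristic treats as the batch size
    };

    // Before this commit: any add whose src1 had ne[1] > 1 was taken to mean
    // "batch size > 1" and disabled CUDA graph capture. Gemma3n's per-layer
    // projection is a matrix-matrix add, so it tripped this check even at
    // batch size 1.
    static bool add_disables_graph_old(const fake_node & src1) {
        return src1.ne1 > 1;
    }

    // After this commit: the same test, except the add inside Gemma3n's
    // `project_per_layer_input` is excluded by matching its source-node names.
    static bool add_disables_graph_new(const fake_node & src0, const fake_node & src1) {
        const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
        const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
        return src1.ne1 > 1
            && src0.name != gemma3n_per_layer_proj_src0_name
            && src1.name != gemma3n_per_layer_proj_src1_name;
    }

For an add whose sources are named "inp_per_layer_selected" and "per_layer_proj", the old check returns true (capture disabled) whenever ne[1] > 1, while the new one returns false, so CUDA graph capture stays enabled at batch size 1; any other matrix-matrix add still disables it.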
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -2590,6 +2590,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
 
+    const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
+    const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
@@ -2611,9 +2614,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now
-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) {
+            // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
+            // by means of matching node names. See
+            // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
+            // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
+            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
             use_cuda_graph = false;
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
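One detail worth noting about the new condition: a ggml tensor's name is stored in a fixed-size char array, while the diff compares it against a `std::string` with `!=`. That resolves to `std::string`'s mixed C-string comparison operator, so the check compares string contents rather than pointers. A minimal standalone illustration (the variable names here are hypothetical):

    #include <cassert>
    #include <string>

    int main() {
        char node_name[64] = "per_layer_proj";              // fixed-size buffer, standing in for ggml_tensor's name field
        const std::string expected = "per_layer_proj";

        assert(!(node_name != expected));                   // equal contents: operator!= is false
        assert(std::string("some_other_add") != expected);  // different contents: operator!= is true
        return 0;
    }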