Oliver Simons committed on
Commit
bb523fb
·
1 Parent(s): 536128f

cuda : Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs (llama/14741)

Browse files

* Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs

Gemma3n uses matrix-matrix addition as part of its input processing,
wrongly triggering CUDA_GRAPH disablement on NVGPUs even when a batch size
of 1 is used.

* Exclude `project_per_layer_input` by matching node names

This ensures that all other graphs which don't exhibit this pattern do
not have their behavior changed.

* Revert unnecessary formatting changes

Files changed (1) hide show
  1. ggml/src/ggml-cuda/ggml-cuda.cu +9 -3
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -2590,6 +2590,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
2590
  // Loop over nodes in GGML graph to obtain info needed for CUDA graph
2591
  cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
2592
 
 
 
 
2593
  for (int i = 0; i < cgraph->n_nodes; i++) {
2594
  ggml_tensor * node = cgraph->nodes[i];
2595
 
@@ -2611,9 +2614,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
2611
  #endif
2612
  }
2613
 
2614
- if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
2615
- // disable CUDA graphs for batch size > 1 for now.
2616
- // Changes in batch size or context size can cause changes to the grid size of some kernels.
 
 
 
2617
  use_cuda_graph = false;
2618
  #ifndef NDEBUG
2619
  GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 
2590
  // Loop over nodes in GGML graph to obtain info needed for CUDA graph
2591
  cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
2592
 
2593
+ const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
2594
+ const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
2595
+
2596
  for (int i = 0; i < cgraph->n_nodes; i++) {
2597
  ggml_tensor * node = cgraph->nodes[i];
2598
 
 
2614
  #endif
2615
  }
2616
 
2617
+ if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) {
2618
+ // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
2619
+ // by means of matching node names. See
2620
+ // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
2621
+ // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
2622
+ // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
2623
  use_cuda_graph = false;
2624
  #ifndef NDEBUG
2625
  GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);