Alan Gray committed
Commit 3944ae5 · 1 Parent(s): 1b9d0f0

ggml: Re-enable CUDA graphs in presence of CONT and DUP nodes (llama/12970)

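The commit threads an opt-out flag through ggml_cuda_cpy so that DUP and CONT nodes, which the CUDA backend handles via ggml_cuda_dup, skip the destination-pointer indirection used for CUDA graph capture; those node types then no longer force graphs to be disabled. A minimal sketch of the resulting call pattern, assuming the ggml-internal headers (the call sites below are illustrative, not lines from the commit):

    // Declaration from cpy.cuh after this commit: the trailing parameter defaults
    // to false, so existing call sites compile unchanged.
    void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0,
                       ggml_tensor * src1, bool disable_indirection = false);

    // Ordinary copy node: indirection remains available when CUDA graphs are in use.
    ggml_cuda_cpy(ctx, src0, src1);

    // DUP/CONT path via ggml_cuda_dup: indirection is disabled for this node only.
    ggml_cuda_cpy(ctx, src0, dst, /*disable_indirection=*/true);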
ggml/src/ggml-cuda/cpy.cu CHANGED
@@ -551,7 +551,7 @@ static void ggml_cpy_f16_f16_cuda(
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }
 
-void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
+void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
 
@@ -588,7 +588,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     char ** dest_ptrs_d = nullptr;
     int graph_cpynode_index = -1;
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
-    if(ctx.cuda_graph->use_cpy_indirection) {
+    if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
     }
@@ -636,7 +636,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
             ggml_type_name(src0->type), ggml_type_name(src1->type));
     }
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
-    if(ctx.cuda_graph->use_cpy_indirection) {
+    if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
     }
 #endif
@@ -645,7 +645,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 
 void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    ggml_cuda_cpy(ctx, src0, dst);
+    bool disable_indirection = true;
+    ggml_cuda_cpy(ctx, src0, dst, disable_indirection);
 }
 
 void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
ggml/src/ggml-cuda/cpy.cuh CHANGED
@@ -2,7 +2,7 @@
 
 #define CUDA_CPY_BLOCK_SIZE 64
 
-void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
+void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection = false);
 
 void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -2489,7 +2489,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_MUL_MAT_ID || node->op == GGML_OP_CONT || node->op == GGML_OP_DUP) {
+        if (node->op == GGML_OP_MUL_MAT_ID) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
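After this change, the compatibility check rejects only GGML_OP_MUL_MAT_ID, so CUDA graph capture stays enabled for graphs that contain GGML_OP_CONT or GGML_OP_DUP nodes, which is the re-enabling the commit title refers to.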