Spaces:
Running
Running
Alan Gray
committed on
Commit
·
3944ae5
1
Parent(s):
1b9d0f0
ggml: Re-enable CUDA graphs in presence of CONT and DUP nodes (llama/12970)
Browse files
ggml/src/ggml-cuda/cpy.cu
CHANGED
|
@@ -551,7 +551,7 @@ static void ggml_cpy_f16_f16_cuda(
|
|
| 551 |
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
| 552 |
}
|
| 553 |
|
| 554 |
-
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
|
| 555 |
const int64_t ne = ggml_nelements(src0);
|
| 556 |
GGML_ASSERT(ne == ggml_nelements(src1));
|
| 557 |
|
|
@@ -588,7 +588,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
|
| 588 |
char ** dest_ptrs_d = nullptr;
|
| 589 |
int graph_cpynode_index = -1;
|
| 590 |
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
| 591 |
-
if(ctx.cuda_graph->use_cpy_indirection) {
|
| 592 |
dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
|
| 593 |
graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
|
| 594 |
}
|
|
@@ -636,7 +636,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
|
| 636 |
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
| 637 |
}
|
| 638 |
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
| 639 |
-
if(ctx.cuda_graph->use_cpy_indirection) {
|
| 640 |
ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
|
| 641 |
}
|
| 642 |
#endif
|
|
@@ -645,7 +645,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
|
| 645 |
|
| 646 |
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
| 647 |
const ggml_tensor * src0 = dst->src[0];
|
| 648 |
-
|
|
|
|
| 649 |
}
|
| 650 |
|
| 651 |
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
|
|
|
| 551 |
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
| 552 |
}
|
| 553 |
|
| 554 |
+
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
|
| 555 |
const int64_t ne = ggml_nelements(src0);
|
| 556 |
GGML_ASSERT(ne == ggml_nelements(src1));
|
| 557 |
|
|
|
|
| 588 |
char ** dest_ptrs_d = nullptr;
|
| 589 |
int graph_cpynode_index = -1;
|
| 590 |
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
| 591 |
+
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
|
| 592 |
dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
|
| 593 |
graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
|
| 594 |
}
|
|
|
|
| 636 |
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
| 637 |
}
|
| 638 |
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
| 639 |
+
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
|
| 640 |
ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
|
| 641 |
}
|
| 642 |
#endif
|
|
|
|
| 645 |
|
| 646 |
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
| 647 |
const ggml_tensor * src0 = dst->src[0];
|
| 648 |
+
bool disable_indirection = true;
|
| 649 |
+
ggml_cuda_cpy(ctx, src0, dst, disable_indirection);
|
| 650 |
}
|
| 651 |
|
| 652 |
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
ggml/src/ggml-cuda/cpy.cuh
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
|
| 3 |
#define CUDA_CPY_BLOCK_SIZE 64
|
| 4 |
|
| 5 |
-
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
|
| 6 |
|
| 7 |
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
| 8 |
|
|
|
|
| 2 |
|
| 3 |
#define CUDA_CPY_BLOCK_SIZE 64
|
| 4 |
|
| 5 |
+
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection = false);
|
| 6 |
|
| 7 |
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
| 8 |
|
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
|
@@ -2489,7 +2489,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
|
| 2489 |
#endif
|
| 2490 |
}
|
| 2491 |
|
| 2492 |
-
if (node->op == GGML_OP_MUL_MAT_ID
|
| 2493 |
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
|
| 2494 |
#ifndef NDEBUG
|
| 2495 |
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
|
|
|
|
| 2489 |
#endif
|
| 2490 |
}
|
| 2491 |
|
| 2492 |
+
if (node->op == GGML_OP_MUL_MAT_ID) {
|
| 2493 |
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
|
| 2494 |
#ifndef NDEBUG
|
| 2495 |
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
|