Diego Devesa committed on
Commit 1c0a5c0 · 1 Parent(s): 6b6155b

sched : avoid changing cur_copy when a graph is already allocated (llama/13922)

Files changed (1)
  1. ggml/src/ggml-backend.cpp +10 -5
ggml/src/ggml-backend.cpp CHANGED
@@ -1340,7 +1340,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     // allocate graph
     if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
-        ggml_backend_sched_synchronize(sched);
+        // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
+        for (int i = 0; i < sched->n_backends; i++) {
+            ggml_backend_synchronize(sched->backends[i]);
+        }
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
@@ -1564,7 +1567,6 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
 
     ggml_backend_sched_split_graph(sched, graph);
 
-
     if (!ggml_backend_sched_alloc_splits(sched)) {
         return false;
     }
@@ -1598,9 +1600,12 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
     for (int i = 0; i < sched->n_backends; i++) {
         ggml_backend_synchronize(sched->backends[i]);
     }
-    // reset the current copy to 0 so that the graphs will be similar during generation
-    // necessary for CUDA graphs
-    sched->cur_copy = 0;
+    if (!sched->is_alloc) {
+        // if the graph is not already allocated, always use copy 0 after a synchronization
+        // this ensures that during generation the same copy is used every time,
+        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
+        sched->cur_copy = 0;
+    }
 }
 
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
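
For context on the first hunk: the allocation fallback used to call ggml_backend_sched_synchronize(), but that function (see the third hunk) is also where cur_copy can be reset, and the fallback only needs every backend to finish its pending work before buffers are re-allocated. Below is a minimal sketch of that idea using a stand-in scheduler type; the fields backends, cur_copy and is_alloc mirror the ones the diff touches, while everything else (toy_backend, the printf output, the function names) is purely illustrative and not the real ggml API.

// Illustrative sketch only -- toy types, not the real ggml_backend_sched.
#include <cstdio>
#include <vector>

struct toy_backend {
    const char * name;
    void synchronize() const { std::printf("sync %s\n", name); } // stand-in for ggml_backend_synchronize
};

struct toy_sched {
    std::vector<toy_backend> backends; // stands in for sched->backends / sched->n_backends
    int  cur_copy = 0;                 // index of the graph copy currently in use
    bool is_alloc = false;             // whether a graph is currently allocated
};

// After this commit the allocation fallback waits for the backends directly,
// so cur_copy keeps whatever value the graph was split and allocated with.
static void alloc_fallback_sync(toy_sched & sched) {
    for (auto & b : sched.backends) {
        b.synchronize();
    }
    // note: no reset of sched.cur_copy here
}

int main() {
    toy_sched sched;
    sched.backends = {{"CUDA0"}, {"CPU"}};
    sched.cur_copy = 1;

    alloc_fallback_sync(sched);
    std::printf("cur_copy after fallback sync: %d\n", sched.cur_copy); // still 1
}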
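
The third hunk makes the reset conditional on sched->is_alloc, which (assuming the usual scheduler lifecycle) is set when a graph is allocated via ggml_backend_sched_alloc_graph and cleared by ggml_backend_sched_reset. A hedged sketch of the resulting behaviour, again with a stand-in type rather than the real ggml_backend_sched:

// Illustrative sketch only: the conditional cur_copy reset from the third hunk,
// with the per-backend synchronization calls omitted.
#include <cstdio>

struct toy_sched {
    int  cur_copy = 0;     // graph copy in use
    bool is_alloc = false; // set when a graph has been allocated, cleared on reset
};

// Mirrors the patched ggml_backend_sched_synchronize logic around cur_copy.
static void sched_synchronize(toy_sched & sched) {
    // (the real function first waits on every backend here)
    if (!sched.is_alloc) {
        // no graph allocated: fall back to copy 0 so successive generation
        // steps build the same graph (keeps things like CUDA graphs usable)
        sched.cur_copy = 0;
    }
    // with a graph already allocated, the copy it was allocated with is kept
}

int main() {
    toy_sched a;            // no pre-allocated graph
    a.cur_copy = 1;
    sched_synchronize(a);
    std::printf("not allocated: cur_copy = %d\n", a.cur_copy); // 0

    toy_sched b;            // graph pre-allocated (is_alloc set)
    b.cur_copy = 1;
    b.is_alloc = true;
    sched_synchronize(b);
    std::printf("allocated:     cur_copy = %d\n", b.cur_copy); // 1
}

The point of the condition is that a caller who pre-allocated a graph keeps computing with the copy it was allocated for, so successive generation steps see an unchanged graph, which is what the in-diff comment about CUDA (or other) graphs being disabled refers to.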