Diego Devesa committed
Commit e9f5612 · 1 Parent(s): a687ec3

sched : fix multiple evaluations of the same graph with pipeline parallelism (llama/14855)

Files changed (1)
  1. ggml/src/ggml-backend.cpp +8 -5
ggml/src/ggml-backend.cpp CHANGED
@@ -647,6 +647,7 @@ struct ggml_backend_sched {
     // pipeline parallelism support
     int n_copies;
     int cur_copy;
+    int next_copy;
     ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
     struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_graph_inputs;
@@ -1433,8 +1434,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         }
     }
 
-    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
-
     return GGML_STATUS_SUCCESS;
 }
 
@@ -1535,10 +1534,10 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
 
-    ggml_backend_sched_split_graph(sched, measure_graph);
-
     ggml_backend_sched_synchronize(sched);
 
+    ggml_backend_sched_split_graph(sched, measure_graph);
+
     if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
@@ -1550,6 +1549,10 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 
 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
+    GGML_ASSERT(!sched->is_alloc);
+
+    sched->cur_copy = sched->next_copy;
+    sched->next_copy = (sched->next_copy + 1) % sched->n_copies;
 
     ggml_backend_sched_split_graph(sched, graph);
 
@@ -1590,7 +1593,7 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
         // if the graph is not already allocated, always use copy 0 after a synchronization
         // this ensures that during generation the same copy is used every time,
        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
-        sched->cur_copy = 0;
+        sched->next_copy = 0;
    }
 }
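In plain terms: compute_splits used to rotate cur_copy after every evaluation, so computing an already-allocated graph a second time ran against a different copy than the one its inputs were allocated and copied into. The rotation now happens once, in ggml_backend_sched_alloc_graph, via the new next_copy counter, and compute_splits leaves the copy index alone. Below is a minimal sketch of that control flow; sched_sketch, alloc_graph, compute and synchronize are hypothetical stand-ins, not the real ggml API.

```cpp
// Minimal sketch of the patched copy rotation (hypothetical types/names).
#include <cassert>
#include <cstdio>

struct sched_sketch {
    int  n_copies;   // > 1 when pipeline parallelism is enabled
    int  cur_copy;   // copy used by the currently allocated graph
    int  next_copy;  // copy the next allocated graph will use
    bool is_alloc;   // whether a graph is currently allocated
};

static void alloc_graph(sched_sketch * s) {
    assert(!s->is_alloc);                            // mirrors the new GGML_ASSERT
    s->cur_copy  = s->next_copy;                     // rotation now happens at allocation...
    s->next_copy = (s->next_copy + 1) % s->n_copies;
    s->is_alloc  = true;
}

static void compute(const sched_sketch * s) {
    // ...and no longer after compute, so re-evaluating the same allocated
    // graph keeps using the same copy (same buffers, same events)
    std::printf("compute on copy %d\n", s->cur_copy);
}

static void synchronize(sched_sketch * s) {
    if (!s->is_alloc) {
        s->next_copy = 0;  // same rationale as the patched comment: restart at copy 0
    }
}

int main() {
    sched_sketch s = { /*n_copies=*/2, 0, 0, false };

    alloc_graph(&s);
    compute(&s);        // copy 0
    compute(&s);        // still copy 0; before the patch this would have drifted to copy 1

    s.is_alloc = false; // stands in for ggml_backend_sched_reset
    synchronize(&s);    // nothing allocated, so the rotation restarts at copy 0
    alloc_graph(&s);
    compute(&s);        // copy 0 again
}
```

Splitting the counter in two is what makes this work: alloc_graph pins cur_copy for the lifetime of the allocation, while synchronize can reset next_copy (only when nothing is allocated) without disturbing a graph that is still in flight.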