ggerganov committed
Commit 0e9101f · unverified · Parent(s): 75c5f9c

whisper : load the model into multiple buffers of max size 1GB (#1763)

Files changed (1)
  1. whisper.cpp +53 -9
whisper.cpp CHANGED
@@ -701,7 +701,7 @@ struct whisper_model {
     struct ggml_context * ctx;
 
     // the model backend data is read-only and can be shared between processors
-    struct ggml_backend_buffer * buffer;
+    std::vector<struct ggml_backend_buffer *> buffers;
 
     // tensors
     int n_loaded;
@@ -1514,24 +1514,64 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
     wctx.backend = whisper_backend_init(wctx.params);
 
+    // some devices have a limit on the maximum size of single memory buffer
+    // for example, iPhones are limited to 1GB per buffer
+    // to workaround this, we will allocate multiple buffers of smaller size and will split the tensors with the
+    // model weights between them
+    //
+    // the map_t2b maps tensor names to buffer indices
+    // as we iterate over the tensors, we will allocate new buffers when the current one is full
+    //
+    // finally, we create a separate allocator for each buffer and use it to allocate the tensors
+    // we keep the allocators alive until all the tensors are loaded
+
+    GGML_ASSERT(model.buffers.empty());
+
+    std::map<std::string, int> map_t2b;
+
     {
         size_t size_main = 0;
+        size_t size_cur  = 0;
+
+        static const size_t GB = 1024ull*1024ull*1024ull;
 
         for (const auto & t : model.tensors) {
-            size_main += ggml_nbytes(t.second) + ggml_tensor_overhead();
+            const size_t cur = ggml_nbytes(t.second) + ggml_tensor_overhead();
+
+            // adding the tensor to the current buffer will exceed the limit, so we need to allocate a new buffer
+            if (size_cur + cur > GB) {
+                GGML_ASSERT(size_cur > 0 && "A tensor is too large to fit in a single buffer");
+
+                model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
+
+                size_cur = cur;
+            }
+
+            map_t2b[t.first] = model.buffers.size();
+
+            size_cur  += cur;
+            size_main += cur;
+        }
+
+        // allocate the last buffer if needed
+        if (size_cur > 0) {
+            model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
         }
 
-        model.buffer = ggml_backend_alloc_buffer(wctx.backend, size_main);
+        GGML_ASSERT(model.buffers.size() > 0);
 
-        WHISPER_LOG_INFO("%s: %8s buffer size = %8.2f MB\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6);
+        WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB (%d buffers)\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6, (int) model.buffers.size());
     }
 
-    ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
+    std::vector<ggml_allocr *> allocs(model.buffers.size());
+    for (size_t i = 0; i < allocs.size(); ++i) {
+        allocs[i] = ggml_allocr_new_from_buffer(model.buffers[i]);
+    }
 
     // allocate tensors in the backend buffers
     {
         for (const auto & t : model.tensors) {
-            ggml_allocr_alloc(alloc, t.second);
+            ggml_allocr_alloc(allocs[map_t2b[t.first]], t.second);
         }
     }
 
@@ -1632,7 +1672,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         }
     }
 
-    ggml_allocr_free(alloc);
+    for (auto & alloc : allocs) {
+        ggml_allocr_free(alloc);
+    }
 
     wctx.t_load_us = ggml_time_us() - t_start_us;
 
@@ -3376,8 +3418,10 @@ void whisper_free(struct whisper_context * ctx) {
         ggml_free(ctx->model.ctx);
     }
 
-    if (ctx->model.buffer) {
-        ggml_backend_buffer_free(ctx->model.buffer);
+    for (auto & buffer : ctx->model.buffers) {
+        if (buffer) {
+            ggml_backend_buffer_free(buffer);
+        }
     }
 
     whisper_free_state(ctx->state);
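
For reference, here is a minimal, self-contained C++ sketch of the splitting strategy used in the hunk above: tensors are visited in order, a new buffer is started whenever the next tensor would push the current one past the 1 GB limit, and a name-to-buffer map records where each tensor ends up. The tensor names and byte sizes below are made up for the example, and the ggml calls (ggml_nbytes, ggml_tensor_overhead, ggml_backend_alloc_buffer, ggml_allocr_*) are replaced with plain size bookkeeping, so this is only an illustration of the idea, not the actual loader code.

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
    static const size_t GB = 1024ull*1024ull*1024ull;

    // hypothetical tensor sizes in bytes, standing in for ggml_nbytes() + ggml_tensor_overhead()
    const std::vector<std::pair<std::string, size_t>> tensors = {
        { "encoder.conv1.weight",    600ull*1024*1024 },
        { "encoder.conv2.weight",    500ull*1024*1024 },
        { "decoder.token_embedding", 700ull*1024*1024 },
        { "decoder.ln.weight",         1ull*1024*1024 },
    };

    std::vector<size_t>        buffer_sizes; // one entry per buffer that would be allocated
    std::map<std::string, int> map_t2b;      // tensor name -> index of the buffer it lands in

    size_t size_cur = 0;
    for (const auto & t : tensors) {
        const size_t cur = t.second;

        // adding this tensor would exceed the limit -> close the current buffer and start a new one
        if (size_cur + cur > GB) {
            // mirrors the GGML_ASSERT in the commit: a single tensor larger than 1 GB cannot be split
            assert(size_cur > 0 && "a tensor is too large to fit in a single buffer");

            buffer_sizes.push_back(size_cur);
            size_cur = 0;
        }

        map_t2b[t.first] = (int) buffer_sizes.size();
        size_cur += cur;
    }

    // close the last, partially filled buffer
    if (size_cur > 0) {
        buffer_sizes.push_back(size_cur);
    }

    for (size_t i = 0; i < buffer_sizes.size(); ++i) {
        printf("buffer %d: %8.2f MB\n", (int) i, buffer_sizes[i] / 1e6);
    }
    for (const auto & kv : map_t2b) {
        printf("%-24s -> buffer %d\n", kv.first.c_str(), kv.second);
    }

    return 0;
}

With these made-up sizes, the first tensor fills buffer 0, the second cannot join it and opens buffer 1, and the last two share buffer 2. The real loader uses the same name-to-index map to pick the matching per-buffer allocator when it places each tensor (ggml_allocr_alloc(allocs[map_t2b[t.first]], t.second)).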