whisper : load the model into multiple buffers of max size 1GB (#1763)
whisper.cpp CHANGED (+53 -9)
@@ -701,7 +701,7 @@ struct whisper_model {
     struct ggml_context * ctx;
 
     // the model backend data is read-only and can be shared between processors
-    struct ggml_backend_buffer * buffer;
+    std::vector<struct ggml_backend_buffer *> buffers;
 
     // tensors
     int n_loaded;
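In short, the model no longer owns a single read-only weight buffer: it owns a list of backend buffers, and the loader records which buffer each tensor was assigned to. A minimal sketch of that bookkeeping, with illustrative names (only the `buffers` member above is part of the patch; `tensor_to_buffer` plays the role of the local `map_t2b` used during loading):

    // sketch only -- ggml_backend_buffer is the opaque ggml type;
    // the struct and alias names here are illustrative, not from whisper.cpp
    #include <map>
    #include <string>
    #include <vector>

    struct ggml_backend_buffer; // allocated via ggml_backend_alloc_buffer(), freed via ggml_backend_buffer_free()

    struct model_weights {
        // one backend buffer per chunk of weights, each at most ~1 GB (previously: a single buffer)
        std::vector<struct ggml_backend_buffer *> buffers;
    };

    // built while sizing the tensors and only needed until loading finishes:
    // tensor name -> index into model_weights::buffers (the patch calls it map_t2b)
    using tensor_to_buffer_map = std::map<std::string, int>;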
@@ -1514,24 +1514,64 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
 
     wctx.backend = whisper_backend_init(wctx.params);
 
+    // some devices have a limit on the maximum size of single memory buffer
+    // for example, iPhones are limited to 1GB per buffer
+    // to workaround this, we will allocate multiple buffers of smaller size and will split the tensors with the
+    // model weights between them
+    //
+    // the map_t2b maps tensor names to buffer indices
+    // as we iterate over the tensors, we will allocate new buffers when the current one is full
+    //
+    // finally, we create a separate allocator for each buffer and use it to allocate the tensors
+    // we keep the allocators alive until all the tensors are loaded
+
+    GGML_ASSERT(model.buffers.empty());
+
+    std::map<std::string, int> map_t2b;
+
     {
         size_t size_main = 0;
+        size_t size_cur  = 0;
+
+        static const size_t GB = 1024ull*1024ull*1024ull;
 
         for (const auto & t : model.tensors) {
-            size_main += ggml_nbytes(t.second) + ggml_tensor_overhead();
+            const size_t cur = ggml_nbytes(t.second) + ggml_tensor_overhead();
+
+            // adding the tensor to the current buffer will exceed the limit, so we need to allocate a new buffer
+            if (size_cur + cur > GB) {
+                GGML_ASSERT(size_cur > 0 && "A tensor is too large to fit in a single buffer");
+
+                model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
+
+                size_cur = cur;
+            }
+
+            map_t2b[t.first] = model.buffers.size();
+
+            size_cur  += cur;
+            size_main += cur;
+        }
+
+        // allocate the last buffer if needed
+        if (size_cur > 0) {
+            model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
         }
 
-        model.buffer = ggml_backend_alloc_buffer(wctx.backend, size_main);
+        GGML_ASSERT(model.buffers.size() > 0);
 
-        WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6);
+        WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB (%d buffers)\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6, (int) model.buffers.size());
     }
 
-    ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
+    std::vector<ggml_allocr *> allocs(model.buffers.size());
+    for (size_t i = 0; i < allocs.size(); ++i) {
+        allocs[i] = ggml_allocr_new_from_buffer(model.buffers[i]);
+    }
 
     // allocate tensors in the backend buffers
     {
         for (const auto & t : model.tensors) {
-            ggml_allocr_alloc(alloc, t.second);
+            ggml_allocr_alloc(allocs[map_t2b[t.first]], t.second);
         }
     }
 
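The comments in the hunk above describe the core idea: walk the tensors in order, keep adding them to the buffer currently being filled, and start a new buffer once the next tensor would push it past 1 GB (the patch asserts if a single tensor alone exceeds the limit). A self-contained sketch of that greedy split, assuming plain byte sizes instead of ggml tensors; the function, type, and tensor names below are illustrative, not taken from the patch, and an oversized tensor simply gets a buffer of its own here rather than triggering an assert:

    // sketch of the greedy "split weights into <= 1 GB buffers" idea
    #include <cstdio>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    struct split_plan {
        std::vector<size_t>        buffer_sizes;     // bytes to allocate per buffer
        std::map<std::string, int> tensor_to_buffer; // plays the role of map_t2b
    };

    static split_plan split_into_buffers(const std::vector<std::pair<std::string, size_t>> & tensors,
                                          size_t max_buffer_size) {
        split_plan plan;

        size_t size_cur = 0; // bytes accumulated in the buffer currently being filled

        for (const auto & t : tensors) {
            const size_t cur = t.second;

            // the next tensor would overflow the current buffer -> close it and start a new one
            if (size_cur + cur > max_buffer_size && size_cur > 0) {
                plan.buffer_sizes.push_back(size_cur);
                size_cur = 0;
            }

            // the tensor is assigned to the buffer currently being filled
            plan.tensor_to_buffer[t.first] = (int) plan.buffer_sizes.size();
            size_cur += cur;
        }

        // close the last, partially filled buffer
        if (size_cur > 0) {
            plan.buffer_sizes.push_back(size_cur);
        }

        return plan;
    }

    int main() {
        const size_t MB = 1024ull*1024ull;

        // made-up tensor sizes, just to exercise the splitting
        const std::vector<std::pair<std::string, size_t>> tensors = {
            { "encoder.blocks.0.mlp.0.weight",  300*MB },
            { "encoder.blocks.1.mlp.0.weight",  400*MB },
            { "decoder.token_embedding.weight", 500*MB },
            { "decoder.blocks.0.mlp.0.weight",  200*MB },
        };

        const split_plan plan = split_into_buffers(tensors, 1024*MB); // 1 GB limit, as in the patch

        std::printf("%zu buffers\n", plan.buffer_sizes.size());
        for (const auto & kv : plan.tensor_to_buffer) {
            std::printf("  %-32s -> buffer %d\n", kv.first.c_str(), kv.second);
        }
        return 0;
    }

With the made-up sizes above this produces two buffers of 700 MB each: the first two tensors land in buffer 0, and the last two in buffer 1.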
@@ -1632,7 +1672,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
         }
     }
 
-    ggml_allocr_free(alloc);
+    for (auto & alloc : allocs) {
+        ggml_allocr_free(alloc);
+    }
 
     wctx.t_load_us = ggml_time_us() - t_start_us;
 
@@ -3376,8 +3418,10 @@ void whisper_free(struct whisper_context * ctx) {
             ggml_free(ctx->model.ctx);
         }
 
-        if (ctx->model.buffer) {
-            ggml_backend_buffer_free(ctx->model.buffer);
+        for (auto & buffer : ctx->model.buffers) {
+            if (buffer) {
+                ggml_backend_buffer_free(buffer);
+            }
         }
 
         whisper_free_state(ctx->state);