Spaces:
Running
Running
Canis Lupus
committed on
whisper : allow non-CoreML fallback when Core ML cannot be loaded (#812)
Browse files
If the Core ML model cannot be loaded, continue without Core ML instead of
returning. This allows a single build to transcribe using Core ML models
where available, and regular models when not.
- whisper.cpp +254 -239
whisper.cpp
CHANGED
|
@@ -592,7 +592,7 @@ struct whisper_state {
|
|
| 592 |
|
| 593 |
std::string path_model; // populated by whisper_init_from_file()
|
| 594 |
#ifdef WHISPER_USE_COREML
|
| 595 |
-
whisper_coreml_context * ctx_coreml;
|
| 596 |
#endif
|
| 597 |
|
| 598 |
// [EXPERIMENTAL] token-level timestamps data
|
|
@@ -1385,320 +1385,331 @@ static bool whisper_encode_internal(
|
|
| 1385 |
}
|
| 1386 |
}
|
| 1387 |
|
| 1388 |
-
#ifndef WHISPER_USE_COREML
|
| 1389 |
struct ggml_tensor * cur;
|
| 1390 |
|
| 1391 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1392 |
{
|
| 1393 |
-
|
|
|
|
|
|
|
| 1394 |
|
| 1395 |
-
|
| 1396 |
-
|
| 1397 |
-
|
| 1398 |
-
|
| 1399 |
-
|
| 1400 |
-
|
| 1401 |
|
| 1402 |
-
|
| 1403 |
|
| 1404 |
-
|
| 1405 |
|
| 1406 |
-
|
| 1407 |
-
|
| 1408 |
-
|
| 1409 |
-
|
| 1410 |
-
|
| 1411 |
-
|
| 1412 |
|
| 1413 |
-
|
| 1414 |
-
|
| 1415 |
|
| 1416 |
-
|
| 1417 |
|
| 1418 |
-
|
| 1419 |
-
|
| 1420 |
-
|
| 1421 |
-
|
| 1422 |
|
| 1423 |
-
|
| 1424 |
|
| 1425 |
-
|
| 1426 |
-
|
| 1427 |
-
|
| 1428 |
-
|
| 1429 |
|
| 1430 |
-
|
| 1431 |
|
| 1432 |
-
|
| 1433 |
-
|
| 1434 |
|
| 1435 |
-
|
| 1436 |
|
| 1437 |
-
|
| 1438 |
|
| 1439 |
-
|
| 1440 |
|
| 1441 |
-
|
| 1442 |
-
|
| 1443 |
|
| 1444 |
-
|
| 1445 |
|
| 1446 |
-
|
| 1447 |
-
|
| 1448 |
|
| 1449 |
-
|
| 1450 |
-
|
| 1451 |
-
|
| 1452 |
|
| 1453 |
-
|
| 1454 |
|
| 1455 |
-
|
| 1456 |
-
|
| 1457 |
-
|
| 1458 |
-
|
| 1459 |
-
|
| 1460 |
-
|
| 1461 |
-
|
| 1462 |
|
| 1463 |
-
|
| 1464 |
-
|
| 1465 |
-
|
| 1466 |
|
| 1467 |
-
|
| 1468 |
-
|
| 1469 |
-
|
| 1470 |
|
| 1471 |
-
|
| 1472 |
-
|
| 1473 |
-
|
| 1474 |
-
|
| 1475 |
-
|
| 1476 |
|
| 1477 |
-
|
| 1478 |
|
| 1479 |
-
|
| 1480 |
-
|
| 1481 |
-
|
| 1482 |
-
|
| 1483 |
|
| 1484 |
-
|
| 1485 |
|
| 1486 |
-
|
| 1487 |
-
|
| 1488 |
-
|
| 1489 |
|
| 1490 |
-
|
| 1491 |
-
|
| 1492 |
-
|
| 1493 |
-
|
| 1494 |
-
|
| 1495 |
|
| 1496 |
-
|
| 1497 |
|
| 1498 |
-
|
| 1499 |
|
| 1500 |
-
#ifdef WHISPER_USE_FLASH_ATTN
|
| 1501 |
-
|
| 1502 |
-
|
| 1503 |
-
|
| 1504 |
-
|
| 1505 |
-
|
| 1506 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1507 |
|
| 1508 |
-
|
| 1509 |
-
ggml_permute(ctx0,
|
| 1510 |
-
ggml_cpy(ctx0,
|
| 1511 |
-
Kcur,
|
| 1512 |
-
ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
|
| 1513 |
-
0, 2, 1, 3);
|
| 1514 |
|
| 1515 |
-
|
| 1516 |
-
|
| 1517 |
-
|
| 1518 |
-
|
| 1519 |
-
Vcur,
|
| 1520 |
-
n_state/n_head, n_head, n_ctx),
|
| 1521 |
-
1, 2, 0, 3),
|
| 1522 |
-
ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
|
| 1523 |
-
|
| 1524 |
-
struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
|
| 1525 |
-
#else
|
| 1526 |
-
struct ggml_tensor * Q =
|
| 1527 |
-
ggml_permute(ctx0,
|
| 1528 |
-
ggml_cpy(ctx0,
|
| 1529 |
-
Qcur,
|
| 1530 |
-
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
|
| 1531 |
-
0, 2, 1, 3);
|
| 1532 |
|
| 1533 |
-
|
| 1534 |
-
|
| 1535 |
-
|
| 1536 |
-
Kcur,
|
| 1537 |
-
ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
|
| 1538 |
-
0, 2, 1, 3);
|
| 1539 |
|
| 1540 |
-
|
| 1541 |
-
|
|
|
|
| 1542 |
|
| 1543 |
-
|
| 1544 |
-
ggml_scale(ctx0,
|
| 1545 |
-
KQ,
|
| 1546 |
-
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
| 1547 |
-
);
|
| 1548 |
|
| 1549 |
-
|
|
|
|
|
|
|
|
|
|
| 1550 |
|
| 1551 |
-
|
| 1552 |
-
// ggml_permute(ctx0,
|
| 1553 |
-
// ggml_cpy(ctx0,
|
| 1554 |
-
// Vcur,
|
| 1555 |
-
// ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
|
| 1556 |
-
// 1, 2, 0, 3);
|
| 1557 |
|
| 1558 |
-
//
|
|
|
|
| 1559 |
|
| 1560 |
-
struct ggml_tensor *
|
| 1561 |
-
ggml_cpy(ctx0,
|
| 1562 |
-
ggml_permute(ctx0,
|
| 1563 |
-
ggml_reshape_3d(ctx0,
|
| 1564 |
-
Vcur,
|
| 1565 |
-
n_state/n_head, n_head, n_ctx),
|
| 1566 |
-
0, 2, 1, 3),
|
| 1567 |
-
ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head)
|
| 1568 |
-
);
|
| 1569 |
-
|
| 1570 |
-
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
|
| 1571 |
-
#endif
|
| 1572 |
-
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
| 1573 |
|
| 1574 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1575 |
|
| 1576 |
-
|
| 1577 |
-
KQV_merged,
|
| 1578 |
-
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
|
| 1579 |
-
}
|
| 1580 |
|
| 1581 |
-
|
| 1582 |
-
{
|
| 1583 |
-
wstate.use_buf(ctx0, 0);
|
| 1584 |
|
| 1585 |
-
|
| 1586 |
-
|
| 1587 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1588 |
|
| 1589 |
-
|
|
|
|
| 1590 |
|
| 1591 |
-
|
| 1592 |
-
|
| 1593 |
-
|
| 1594 |
-
|
|
|
|
| 1595 |
|
| 1596 |
-
|
|
|
|
|
|
|
|
|
|
| 1597 |
|
| 1598 |
-
|
| 1599 |
-
cur = ggml_add(ctx0, cur, inpL);
|
| 1600 |
|
| 1601 |
-
|
|
|
|
|
|
|
| 1602 |
|
| 1603 |
-
// feed-forward network
|
| 1604 |
-
{
|
| 1605 |
-
// norm
|
| 1606 |
-
{
|
| 1607 |
wstate.use_buf(ctx0, 0);
|
| 1608 |
|
| 1609 |
-
|
|
|
|
| 1610 |
|
| 1611 |
wstate.use_buf(ctx0, 1);
|
| 1612 |
|
| 1613 |
-
//
|
| 1614 |
-
cur =
|
| 1615 |
-
|
| 1616 |
-
|
| 1617 |
-
cur),
|
| 1618 |
-
ggml_repeat(ctx0, layer.mlp_ln_b, cur));
|
| 1619 |
-
}
|
| 1620 |
|
| 1621 |
-
|
| 1622 |
-
wstate.use_buf(ctx0, 0);
|
| 1623 |
|
| 1624 |
-
|
| 1625 |
-
|
| 1626 |
-
|
| 1627 |
-
#
|
| 1628 |
-
|
| 1629 |
|
| 1630 |
-
|
| 1631 |
-
cur = ggml_mul_mat(ctx0,
|
| 1632 |
-
layer.mlp_0_w,
|
| 1633 |
-
cur);
|
| 1634 |
|
| 1635 |
-
|
|
|
|
| 1636 |
|
| 1637 |
-
|
| 1638 |
-
ggml_repeat(ctx0, layer.mlp_0_b, cur),
|
| 1639 |
-
cur);
|
| 1640 |
|
|
|
|
|
|
|
| 1641 |
wstate.use_buf(ctx0, 0);
|
| 1642 |
|
| 1643 |
-
|
| 1644 |
-
cur = ggml_gelu(ctx0, cur);
|
| 1645 |
|
| 1646 |
wstate.use_buf(ctx0, 1);
|
| 1647 |
|
| 1648 |
-
//
|
| 1649 |
-
cur = ggml_mul_mat(ctx0,
|
| 1650 |
-
layer.mlp_1_w,
|
| 1651 |
-
cur);
|
| 1652 |
-
|
| 1653 |
-
wstate.use_buf(ctx0, 0);
|
| 1654 |
-
|
| 1655 |
cur = ggml_add(ctx0,
|
| 1656 |
-
|
| 1657 |
-
|
| 1658 |
-
|
|
|
|
| 1659 |
}
|
| 1660 |
|
| 1661 |
-
wstate.use_buf(ctx0,
|
| 1662 |
-
|
| 1663 |
-
inpL = ggml_add(ctx0, cur, inpFF);
|
| 1664 |
-
}
|
| 1665 |
|
| 1666 |
-
|
| 1667 |
-
|
| 1668 |
-
|
| 1669 |
-
|
| 1670 |
-
wstate.use_buf(ctx0, 0);
|
| 1671 |
-
|
| 1672 |
-
cur = ggml_norm(ctx0, cur);
|
| 1673 |
|
| 1674 |
-
|
|
|
|
| 1675 |
|
| 1676 |
-
|
| 1677 |
-
|
| 1678 |
-
ggml_mul(ctx0,
|
| 1679 |
-
ggml_repeat(ctx0, model.e_ln_w, cur),
|
| 1680 |
-
cur),
|
| 1681 |
-
ggml_repeat(ctx0, model.e_ln_b, cur));
|
| 1682 |
}
|
| 1683 |
-
|
| 1684 |
-
|
| 1685 |
-
|
| 1686 |
-
// run the computation
|
| 1687 |
{
|
| 1688 |
-
|
| 1689 |
-
gf.n_threads = n_threads;
|
| 1690 |
|
| 1691 |
-
|
| 1692 |
-
ggml_graph_compute(ctx0, &gf);
|
| 1693 |
|
| 1694 |
-
|
| 1695 |
}
|
| 1696 |
-
#else
|
| 1697 |
-
wstate.use_buf(ctx0, -1);
|
| 1698 |
-
|
| 1699 |
-
struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
|
| 1700 |
-
|
| 1701 |
-
whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
|
| 1702 |
#endif
|
| 1703 |
|
| 1704 |
// cur
|
|
@@ -2569,10 +2580,12 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
|
| 2569 |
state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
|
| 2570 |
if (!state->ctx_coreml) {
|
| 2571 |
fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
|
|
|
|
| 2572 |
return nullptr;
|
|
|
|
|
|
|
|
|
|
| 2573 |
}
|
| 2574 |
-
|
| 2575 |
-
fprintf(stderr, "%s: Core ML model loaded\n", __func__);
|
| 2576 |
#endif
|
| 2577 |
|
| 2578 |
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
|
|
@@ -2745,8 +2758,10 @@ void whisper_free_state(struct whisper_state * state)
|
|
| 2745 |
}
|
| 2746 |
|
| 2747 |
#ifdef WHISPER_USE_COREML
|
| 2748 |
-
|
| 2749 |
-
|
|
|
|
|
|
|
| 2750 |
#endif
|
| 2751 |
|
| 2752 |
delete state;
|
|
|
|
| 592 |
|
| 593 |
std::string path_model; // populated by whisper_init_from_file()
|
| 594 |
#ifdef WHISPER_USE_COREML
|
| 595 |
+
whisper_coreml_context * ctx_coreml = nullptr;
|
| 596 |
#endif
|
| 597 |
|
| 598 |
// [EXPERIMENTAL] token-level timestamps data
|
|
|
|
| 1385 |
}
|
| 1386 |
}
|
| 1387 |
|
|
|
|
| 1388 |
struct ggml_tensor * cur;
|
| 1389 |
|
| 1390 |
+
#ifndef WHISPER_USE_COREML
|
| 1391 |
+
const bool use_coreml = false;
|
| 1392 |
+
#else
|
| 1393 |
+
const bool use_coreml = wstate.ctx_coreml != nullptr;
|
| 1394 |
+
#endif
|
| 1395 |
+
|
| 1396 |
+
if (!use_coreml)
|
| 1397 |
{
|
| 1398 |
+
// convolution + gelu
|
| 1399 |
+
{
|
| 1400 |
+
wstate.use_buf(ctx0, 1);
|
| 1401 |
|
| 1402 |
+
cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel);
|
| 1403 |
+
cur = ggml_add(ctx0,
|
| 1404 |
+
ggml_repeat(ctx0,
|
| 1405 |
+
model.e_conv_1_b,
|
| 1406 |
+
cur),
|
| 1407 |
+
cur);
|
| 1408 |
|
| 1409 |
+
cur = ggml_gelu(ctx0, cur);
|
| 1410 |
|
| 1411 |
+
wstate.use_buf(ctx0, 0);
|
| 1412 |
|
| 1413 |
+
cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur);
|
| 1414 |
+
cur = ggml_add(ctx0,
|
| 1415 |
+
ggml_repeat(ctx0,
|
| 1416 |
+
model.e_conv_2_b,
|
| 1417 |
+
cur),
|
| 1418 |
+
cur);
|
| 1419 |
|
| 1420 |
+
cur = ggml_gelu(ctx0, cur);
|
| 1421 |
+
}
|
| 1422 |
|
| 1423 |
+
wstate.use_buf(ctx0, 3);
|
| 1424 |
|
| 1425 |
+
// ===================================================================
|
| 1426 |
+
// NOTE: experimenting with partial evaluation of the encoder (ignore)
|
| 1427 |
+
//static int iter = -1;
|
| 1428 |
+
//const int n_iter = 1500/n_ctx;
|
| 1429 |
|
| 1430 |
+
//iter = (iter + 1) % n_iter;
|
| 1431 |
|
| 1432 |
+
//if (iter == 0) {
|
| 1433 |
+
// memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
|
| 1434 |
+
// memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
|
| 1435 |
+
//}
|
| 1436 |
|
| 1437 |
+
static int iter = 0;
|
| 1438 |
|
| 1439 |
+
const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
|
| 1440 |
+
const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
|
| 1441 |
|
| 1442 |
+
struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
|
| 1443 |
|
| 1444 |
+
cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
|
| 1445 |
|
| 1446 |
+
// ===================================================================
|
| 1447 |
|
| 1448 |
+
// original:
|
| 1449 |
+
//cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
|
| 1450 |
|
| 1451 |
+
struct ggml_tensor * inpL = cur;
|
| 1452 |
|
| 1453 |
+
for (int il = 0; il < n_layer; ++il) {
|
| 1454 |
+
const auto & layer = model.layers_encoder[il];
|
| 1455 |
|
| 1456 |
+
// norm
|
| 1457 |
+
{
|
| 1458 |
+
wstate.use_buf(ctx0, 0);
|
| 1459 |
|
| 1460 |
+
cur = ggml_norm(ctx0, inpL);
|
| 1461 |
|
| 1462 |
+
// cur = ln_0_w*cur + ln_0_b
|
| 1463 |
+
cur = ggml_add(ctx0,
|
| 1464 |
+
ggml_mul(ctx0,
|
| 1465 |
+
ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
|
| 1466 |
+
cur),
|
| 1467 |
+
ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
|
| 1468 |
+
}
|
| 1469 |
|
| 1470 |
+
// self-attention
|
| 1471 |
+
{
|
| 1472 |
+
wstate.use_buf(ctx0, 1);
|
| 1473 |
|
| 1474 |
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
|
| 1475 |
+
layer.attn_q_w,
|
| 1476 |
+
cur);
|
| 1477 |
|
| 1478 |
+
Qcur = ggml_add(ctx0,
|
| 1479 |
+
ggml_repeat(ctx0,
|
| 1480 |
+
layer.attn_q_b,
|
| 1481 |
+
Qcur),
|
| 1482 |
+
Qcur);
|
| 1483 |
|
| 1484 |
+
//Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
| 1485 |
|
| 1486 |
+
// note: no bias for Key
|
| 1487 |
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
|
| 1488 |
+
layer.attn_k_w,
|
| 1489 |
+
cur);
|
| 1490 |
|
| 1491 |
+
//Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
|
| 1492 |
|
| 1493 |
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
|
| 1494 |
+
layer.attn_v_w,
|
| 1495 |
+
cur);
|
| 1496 |
|
| 1497 |
+
Vcur = ggml_add(ctx0,
|
| 1498 |
+
ggml_repeat(ctx0,
|
| 1499 |
+
layer.attn_v_b,
|
| 1500 |
+
Vcur),
|
| 1501 |
+
Vcur);
|
| 1502 |
|
| 1503 |
+
// ------
|
| 1504 |
|
| 1505 |
+
wstate.use_buf(ctx0, 0);
|
| 1506 |
|
| 1507 |
+
#ifdef WHISPER_USE_FLASH_ATTN
|
| 1508 |
+
struct ggml_tensor * Q =
|
| 1509 |
+
ggml_permute(ctx0,
|
| 1510 |
+
ggml_cpy(ctx0,
|
| 1511 |
+
Qcur,
|
| 1512 |
+
ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
|
| 1513 |
+
0, 2, 1, 3);
|
| 1514 |
+
|
| 1515 |
+
struct ggml_tensor * K =
|
| 1516 |
+
ggml_permute(ctx0,
|
| 1517 |
+
ggml_cpy(ctx0,
|
| 1518 |
+
Kcur,
|
| 1519 |
+
ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
|
| 1520 |
+
0, 2, 1, 3);
|
| 1521 |
+
|
| 1522 |
+
struct ggml_tensor * V =
|
| 1523 |
+
ggml_cpy(ctx0,
|
| 1524 |
+
ggml_permute(ctx0,
|
| 1525 |
+
ggml_reshape_3d(ctx0,
|
| 1526 |
+
Vcur,
|
| 1527 |
+
n_state/n_head, n_head, n_ctx),
|
| 1528 |
+
1, 2, 0, 3),
|
| 1529 |
+
ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
|
| 1530 |
+
|
| 1531 |
+
struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
|
| 1532 |
+
#else
|
| 1533 |
+
struct ggml_tensor * Q =
|
| 1534 |
+
ggml_permute(ctx0,
|
| 1535 |
+
ggml_cpy(ctx0,
|
| 1536 |
+
Qcur,
|
| 1537 |
+
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
|
| 1538 |
+
0, 2, 1, 3);
|
| 1539 |
+
|
| 1540 |
+
struct ggml_tensor * K =
|
| 1541 |
+
ggml_permute(ctx0,
|
| 1542 |
+
ggml_cpy(ctx0,
|
| 1543 |
+
Kcur,
|
| 1544 |
+
ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
|
| 1545 |
+
0, 2, 1, 3);
|
| 1546 |
+
|
| 1547 |
+
// K * Q
|
| 1548 |
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
| 1549 |
+
|
| 1550 |
+
struct ggml_tensor * KQ_scaled =
|
| 1551 |
+
ggml_scale(ctx0,
|
| 1552 |
+
KQ,
|
| 1553 |
+
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
|
| 1554 |
+
);
|
| 1555 |
+
|
| 1556 |
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
|
| 1557 |
+
|
| 1558 |
+
//struct ggml_tensor * V_trans =
|
| 1559 |
+
// ggml_permute(ctx0,
|
| 1560 |
+
// ggml_cpy(ctx0,
|
| 1561 |
+
// Vcur,
|
| 1562 |
+
// ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
|
| 1563 |
+
// 1, 2, 0, 3);
|
| 1564 |
+
|
| 1565 |
+
//struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
| 1566 |
+
|
| 1567 |
+
struct ggml_tensor * V =
|
| 1568 |
+
ggml_cpy(ctx0,
|
| 1569 |
+
ggml_permute(ctx0,
|
| 1570 |
+
ggml_reshape_3d(ctx0,
|
| 1571 |
+
Vcur,
|
| 1572 |
+
n_state/n_head, n_head, n_ctx),
|
| 1573 |
+
0, 2, 1, 3),
|
| 1574 |
+
ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head)
|
| 1575 |
+
);
|
| 1576 |
+
|
| 1577 |
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
|
| 1578 |
+
#endif
|
| 1579 |
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
| 1580 |
|
| 1581 |
+
wstate.use_buf(ctx0, 1);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1582 |
|
| 1583 |
+
cur = ggml_cpy(ctx0,
|
| 1584 |
+
KQV_merged,
|
| 1585 |
+
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
|
| 1586 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1587 |
|
| 1588 |
+
// projection
|
| 1589 |
+
{
|
| 1590 |
+
wstate.use_buf(ctx0, 0);
|
|
|
|
|
|
|
|
|
|
| 1591 |
|
| 1592 |
+
cur = ggml_mul_mat(ctx0,
|
| 1593 |
+
layer.attn_ln_1_w,
|
| 1594 |
+
cur);
|
| 1595 |
|
| 1596 |
+
wstate.use_buf(ctx0, 1);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1597 |
|
| 1598 |
+
cur = ggml_add(ctx0,
|
| 1599 |
+
ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
|
| 1600 |
+
cur);
|
| 1601 |
+
}
|
| 1602 |
|
| 1603 |
+
wstate.use_buf(ctx0, 2);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1604 |
|
| 1605 |
+
// add the input
|
| 1606 |
+
cur = ggml_add(ctx0, cur, inpL);
|
| 1607 |
|
| 1608 |
+
struct ggml_tensor * inpFF = cur;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1609 |
|
| 1610 |
+
// feed-forward network
|
| 1611 |
+
{
|
| 1612 |
+
// norm
|
| 1613 |
+
{
|
| 1614 |
+
wstate.use_buf(ctx0, 0);
|
| 1615 |
|
| 1616 |
+
cur = ggml_norm(ctx0, inpFF);
|
|
|
|
|
|
|
|
|
|
| 1617 |
|
| 1618 |
+
wstate.use_buf(ctx0, 1);
|
|
|
|
|
|
|
| 1619 |
|
| 1620 |
+
// cur = mlp_ln_w*cur + mlp_ln_b
|
| 1621 |
+
cur = ggml_add(ctx0,
|
| 1622 |
+
ggml_mul(ctx0,
|
| 1623 |
+
ggml_repeat(ctx0, layer.mlp_ln_w, cur),
|
| 1624 |
+
cur),
|
| 1625 |
+
ggml_repeat(ctx0, layer.mlp_ln_b, cur));
|
| 1626 |
+
}
|
| 1627 |
|
| 1628 |
+
#ifdef WHISPER_USE_FLASH_FF
|
| 1629 |
+
wstate.use_buf(ctx0, 0);
|
| 1630 |
|
| 1631 |
+
cur = ggml_flash_ff(ctx0,
|
| 1632 |
+
ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.wtype, n_state, n_ctx)),
|
| 1633 |
+
layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
|
| 1634 |
+
#else
|
| 1635 |
+
wstate.use_buf(ctx0, 0);
|
| 1636 |
|
| 1637 |
+
// fully connected
|
| 1638 |
+
cur = ggml_mul_mat(ctx0,
|
| 1639 |
+
layer.mlp_0_w,
|
| 1640 |
+
cur);
|
| 1641 |
|
| 1642 |
+
wstate.use_buf(ctx0, 1);
|
|
|
|
| 1643 |
|
| 1644 |
+
cur = ggml_add(ctx0,
|
| 1645 |
+
ggml_repeat(ctx0, layer.mlp_0_b, cur),
|
| 1646 |
+
cur);
|
| 1647 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1648 |
wstate.use_buf(ctx0, 0);
|
| 1649 |
|
| 1650 |
+
// GELU activation
|
| 1651 |
+
cur = ggml_gelu(ctx0, cur);
|
| 1652 |
|
| 1653 |
wstate.use_buf(ctx0, 1);
|
| 1654 |
|
| 1655 |
+
// projection
|
| 1656 |
+
cur = ggml_mul_mat(ctx0,
|
| 1657 |
+
layer.mlp_1_w,
|
| 1658 |
+
cur);
|
|
|
|
|
|
|
|
|
|
| 1659 |
|
| 1660 |
+
wstate.use_buf(ctx0, 0);
|
|
|
|
| 1661 |
|
| 1662 |
+
cur = ggml_add(ctx0,
|
| 1663 |
+
ggml_repeat(ctx0, layer.mlp_1_b, cur),
|
| 1664 |
+
cur);
|
| 1665 |
+
#endif
|
| 1666 |
+
}
|
| 1667 |
|
| 1668 |
+
wstate.use_buf(ctx0, 3);
|
|
|
|
|
|
|
|
|
|
| 1669 |
|
| 1670 |
+
inpL = ggml_add(ctx0, cur, inpFF);
|
| 1671 |
+
}
|
| 1672 |
|
| 1673 |
+
cur = inpL;
|
|
|
|
|
|
|
| 1674 |
|
| 1675 |
+
// norm
|
| 1676 |
+
{
|
| 1677 |
wstate.use_buf(ctx0, 0);
|
| 1678 |
|
| 1679 |
+
cur = ggml_norm(ctx0, cur);
|
|
|
|
| 1680 |
|
| 1681 |
wstate.use_buf(ctx0, 1);
|
| 1682 |
|
| 1683 |
+
// cur = ln_f_g*cur + ln_f_b
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1684 |
cur = ggml_add(ctx0,
|
| 1685 |
+
ggml_mul(ctx0,
|
| 1686 |
+
ggml_repeat(ctx0, model.e_ln_w, cur),
|
| 1687 |
+
cur),
|
| 1688 |
+
ggml_repeat(ctx0, model.e_ln_b, cur));
|
| 1689 |
}
|
| 1690 |
|
| 1691 |
+
wstate.use_buf(ctx0, -1);
|
|
|
|
|
|
|
|
|
|
| 1692 |
|
| 1693 |
+
// run the computation
|
| 1694 |
+
{
|
| 1695 |
+
struct ggml_cgraph gf = {};
|
| 1696 |
+
gf.n_threads = n_threads;
|
|
|
|
|
|
|
|
|
|
| 1697 |
|
| 1698 |
+
ggml_build_forward_expand(&gf, cur);
|
| 1699 |
+
ggml_graph_compute(ctx0, &gf);
|
| 1700 |
|
| 1701 |
+
//ggml_graph_print(&gf);
|
| 1702 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1703 |
}
|
| 1704 |
+
#ifdef WHISPER_USE_COREML
|
| 1705 |
+
else
|
|
|
|
|
|
|
| 1706 |
{
|
| 1707 |
+
wstate.use_buf(ctx0, -1);
|
|
|
|
| 1708 |
|
| 1709 |
+
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
|
|
|
|
| 1710 |
|
| 1711 |
+
whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
|
| 1712 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1713 |
#endif
|
| 1714 |
|
| 1715 |
// cur
|
|
|
|
| 2580 |
state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
|
| 2581 |
if (!state->ctx_coreml) {
|
| 2582 |
fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
|
| 2583 |
+
#ifndef WHISPER_COREML_ALLOW_FALLBACK
|
| 2584 |
return nullptr;
|
| 2585 |
+
#endif
|
| 2586 |
+
} else {
|
| 2587 |
+
fprintf(stderr, "%s: Core ML model loaded\n", __func__);
|
| 2588 |
}
|
|
|
|
|
|
|
| 2589 |
#endif
|
| 2590 |
|
| 2591 |
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
|
|
|
|
| 2758 |
}
|
| 2759 |
|
| 2760 |
#ifdef WHISPER_USE_COREML
|
| 2761 |
+
if (state->ctx_coreml != nullptr) {
|
| 2762 |
+
whisper_coreml_free(state->ctx_coreml);
|
| 2763 |
+
state->ctx_coreml = nullptr;
|
| 2764 |
+
}
|
| 2765 |
#endif
|
| 2766 |
|
| 2767 |
delete state;
|