Canis Lupus committed
Commit ebe63a9 · unverified · 1 Parent(s): 48b9b5d

whisper : allow non-CoreML fallback when Core ML cannot be loaded (#812)


If the Core ML model cannot be loaded, continue without Core ML instead of
returning. This allows a single build to transcribe using Core ML models
where available, and regular models when not.
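
The resulting control flow is sketched below in a condensed, self-contained form. This is not the actual whisper.cpp code: coreml_ctx, load_coreml_model, encode_with_coreml and encode_with_ggml are hypothetical stand-ins for whisper_coreml_context, whisper_coreml_init, whisper_coreml_encode and the ggml encoder graph. The two checks it illustrates come straight from the diff: a Core ML load failure is fatal only when WHISPER_COREML_ALLOW_FALLBACK is not defined, and the Core ML encode path is taken at run time only if a Core ML context was actually loaded.

// Condensed sketch of the fallback logic (not the actual whisper.cpp code).
#include <cstdio>

struct coreml_ctx { int unused; };

static coreml_ctx * load_coreml_model(const char * path) {
    (void) path;
    return nullptr; // simulate: the Core ML model could not be loaded
}

static void encode_with_coreml(coreml_ctx *) { std::puts("encode: Core ML path"); }
static void encode_with_ggml()               { std::puts("encode: regular ggml path"); }

struct state {
    coreml_ctx * ctx_coreml = nullptr; // default-initialized, as whisper_state now does
};

static bool init_state(state & st, const char * path_coreml) {
    st.ctx_coreml = load_coreml_model(path_coreml);
    if (!st.ctx_coreml) {
        std::fprintf(stderr, "failed to load Core ML model from '%s'\n", path_coreml);
#ifndef WHISPER_COREML_ALLOW_FALLBACK
        return false; // without the fallback flag a load failure is still fatal
#endif
    }
    return true; // with the flag defined, continue without Core ML
}

static void encode(const state & st) {
#ifndef WHISPER_USE_COREML
    const bool use_coreml = false;
#else
    const bool use_coreml = st.ctx_coreml != nullptr; // runtime check, not only compile-time
#endif
    if (!use_coreml) {
        encode_with_ggml();
    }
#ifdef WHISPER_USE_COREML
    else {
        encode_with_coreml(st.ctx_coreml);
    }
#endif
}

int main() {
    state st;
    if (!init_state(st, "example-encoder.mlmodelc")) {
        return 1;
    }
    encode(st);
    return 0;
}

Keeping the decision at run time (ctx_coreml != nullptr) rather than only at compile time is what lets a single binary use Core ML when the model is present and fall back to the regular encoder when it is not.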

Files changed (1)
  1. whisper.cpp +254 -239
whisper.cpp CHANGED
@@ -592,7 +592,7 @@ struct whisper_state {

    std::string path_model; // populated by whisper_init_from_file()
#ifdef WHISPER_USE_COREML
-    whisper_coreml_context * ctx_coreml;
+    whisper_coreml_context * ctx_coreml = nullptr;
#endif

    // [EXPERIMENTAL] token-level timestamps data
@@ -1385,320 +1385,331 @@ static bool whisper_encode_internal(
        }
    }

-#ifndef WHISPER_USE_COREML
-    struct ggml_tensor * cur;
-
-    // convolution + gelu
-    {
-        wstate.use_buf(ctx0, 1);
-
-        cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel);
-        cur = ggml_add(ctx0,
-            ggml_repeat(ctx0,
-                model.e_conv_1_b,
-                cur),
-            cur);
-
-        cur = ggml_gelu(ctx0, cur);
-
-        wstate.use_buf(ctx0, 0);
-
-        cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur);
-        cur = ggml_add(ctx0,
-            ggml_repeat(ctx0,
-                model.e_conv_2_b,
-                cur),
-            cur);
-
-        cur = ggml_gelu(ctx0, cur);
-    }
-
-    wstate.use_buf(ctx0, 3);
-
-    // ===================================================================
-    // NOTE: experimenting with partial evaluation of the encoder (ignore)
-    //static int iter = -1;
-    //const int n_iter = 1500/n_ctx;
-
-    //iter = (iter + 1) % n_iter;
-
-    //if (iter == 0) {
-    //    memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
-    //    memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
-    //}
-
-    static int iter = 0;
-
-    const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
-    const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
-
-    struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
-
-    cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
-
-    // ===================================================================
-
-    // original:
-    //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
-
-    struct ggml_tensor * inpL = cur;
-
-    for (int il = 0; il < n_layer; ++il) {
-        const auto & layer = model.layers_encoder[il];
-
-        // norm
-        {
-            wstate.use_buf(ctx0, 0);
-
-            cur = ggml_norm(ctx0, inpL);
-
-            // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
-                    cur),
-                ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
-        }
-
-        // self-attention
-        {
-            wstate.use_buf(ctx0, 1);
-
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
-                layer.attn_q_w,
-                cur);
-
-            Qcur = ggml_add(ctx0,
-                ggml_repeat(ctx0,
-                    layer.attn_q_b,
-                    Qcur),
-                Qcur);
-
-            //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
-
-            // note: no bias for Key
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
-                layer.attn_k_w,
-                cur);
-
-            //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
-
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
-                layer.attn_v_w,
-                cur);
-
-            Vcur = ggml_add(ctx0,
-                ggml_repeat(ctx0,
-                    layer.attn_v_b,
-                    Vcur),
-                Vcur);
-
-            // ------
-
-            wstate.use_buf(ctx0, 0);
-
-#ifdef WHISPER_USE_FLASH_ATTN
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                    ggml_cpy(ctx0,
-                        Qcur,
-                        ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
-                    0, 2, 1, 3);
-
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                    ggml_cpy(ctx0,
-                        Kcur,
-                        ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
-                    0, 2, 1, 3);
-
-            struct ggml_tensor * V =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            Vcur,
-                            n_state/n_head, n_head, n_ctx),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
-
-            struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
-#else
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                    ggml_cpy(ctx0,
-                        Qcur,
-                        ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
-                    0, 2, 1, 3);
-
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                    ggml_cpy(ctx0,
-                        Kcur,
-                        ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
-                    0, 2, 1, 3);
-
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
-                    KQ,
-                    ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
-                    );
-
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
-
-            //struct ggml_tensor * V_trans =
-            //    ggml_permute(ctx0,
-            //        ggml_cpy(ctx0,
-            //            Vcur,
-            //            ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
-            //        1, 2, 0, 3);
-
-            //struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            struct ggml_tensor * V =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            Vcur,
-                            n_state/n_head, n_head, n_ctx),
-                        0, 2, 1, 3),
-                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head)
-                    );
-
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
-#endif
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            wstate.use_buf(ctx0, 1);
-
-            cur = ggml_cpy(ctx0,
-                KQV_merged,
-                ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
-        }
-
-        // projection
-        {
-            wstate.use_buf(ctx0, 0);
-
-            cur = ggml_mul_mat(ctx0,
-                layer.attn_ln_1_w,
-                cur);
-
-            wstate.use_buf(ctx0, 1);
-
-            cur = ggml_add(ctx0,
-                ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
-                cur);
-        }
-
-        wstate.use_buf(ctx0, 2);
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                wstate.use_buf(ctx0, 0);
-
-                cur = ggml_norm(ctx0, inpFF);
-
-                wstate.use_buf(ctx0, 1);
-
-                // cur = mlp_ln_w*cur + mlp_ln_b
-                cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, layer.mlp_ln_w, cur),
-                        cur),
-                    ggml_repeat(ctx0, layer.mlp_ln_b, cur));
-            }
-
-#ifdef WHISPER_USE_FLASH_FF
-            wstate.use_buf(ctx0, 0);
-
-            cur = ggml_flash_ff(ctx0,
-                ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.wtype, n_state, n_ctx)),
-                layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
-#else
-            wstate.use_buf(ctx0, 0);
-
-            // fully connected
-            cur = ggml_mul_mat(ctx0,
-                layer.mlp_0_w,
-                cur);
-
-            wstate.use_buf(ctx0, 1);
-
-            cur = ggml_add(ctx0,
-                ggml_repeat(ctx0, layer.mlp_0_b, cur),
-                cur);
-
-            wstate.use_buf(ctx0, 0);
-
-            // GELU activation
-            cur = ggml_gelu(ctx0, cur);
-
-            wstate.use_buf(ctx0, 1);
-
-            // projection
-            cur = ggml_mul_mat(ctx0,
-                layer.mlp_1_w,
-                cur);
-
-            wstate.use_buf(ctx0, 0);
-
-            cur = ggml_add(ctx0,
-                ggml_repeat(ctx0, layer.mlp_1_b, cur),
-                cur);
-#endif
-        }
-
-        wstate.use_buf(ctx0, 3);
-
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    cur = inpL;
-
-    // norm
-    {
-        wstate.use_buf(ctx0, 0);
-
-        cur = ggml_norm(ctx0, cur);
-
-        wstate.use_buf(ctx0, 1);
-
-        // cur = ln_f_g*cur + ln_f_b
-        cur = ggml_add(ctx0,
-            ggml_mul(ctx0,
-                ggml_repeat(ctx0, model.e_ln_w, cur),
-                cur),
-            ggml_repeat(ctx0, model.e_ln_b, cur));
-    }
-
-    wstate.use_buf(ctx0, -1);
-
-    // run the computation
-    {
-        struct ggml_cgraph gf = {};
-        gf.n_threads = n_threads;
-
-        ggml_build_forward_expand(&gf, cur);
-        ggml_graph_compute(ctx0, &gf);
-
-        //ggml_graph_print(&gf);
-    }
-#else
-    wstate.use_buf(ctx0, -1);
-
-    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
-
-    whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
+    struct ggml_tensor * cur;
+
+#ifndef WHISPER_USE_COREML
+    const bool use_coreml = false;
+#else
+    const bool use_coreml = wstate.ctx_coreml != nullptr;
+#endif
+
+    if (!use_coreml)
+    {
+        // convolution + gelu
+        {
+            wstate.use_buf(ctx0, 1);
+
+            cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel);
+            cur = ggml_add(ctx0,
+                ggml_repeat(ctx0,
+                    model.e_conv_1_b,
+                    cur),
+                cur);
+
+            cur = ggml_gelu(ctx0, cur);
+
+            wstate.use_buf(ctx0, 0);
+
+            cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur);
+            cur = ggml_add(ctx0,
+                ggml_repeat(ctx0,
+                    model.e_conv_2_b,
+                    cur),
+                cur);
+
+            cur = ggml_gelu(ctx0, cur);
+        }
+
+        wstate.use_buf(ctx0, 3);
+
+        // ===================================================================
+        // NOTE: experimenting with partial evaluation of the encoder (ignore)
+        //static int iter = -1;
+        //const int n_iter = 1500/n_ctx;
+
+        //iter = (iter + 1) % n_iter;
+
+        //if (iter == 0) {
+        //    memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
+        //    memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
+        //}
+
+        static int iter = 0;
+
+        const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
+        const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
+
+        struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
+
+        cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
+
+        // ===================================================================
+
+        // original:
+        //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
+
+        struct ggml_tensor * inpL = cur;
+
+        for (int il = 0; il < n_layer; ++il) {
+            const auto & layer = model.layers_encoder[il];
+
+            // norm
+            {
+                wstate.use_buf(ctx0, 0);
+
+                cur = ggml_norm(ctx0, inpL);
+
+                // cur = ln_0_w*cur + ln_0_b
+                cur = ggml_add(ctx0,
+                    ggml_mul(ctx0,
+                        ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
+                        cur),
+                    ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
+            }
+
+            // self-attention
+            {
+                wstate.use_buf(ctx0, 1);
+
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
+                    layer.attn_q_w,
+                    cur);
+
+                Qcur = ggml_add(ctx0,
+                    ggml_repeat(ctx0,
+                        layer.attn_q_b,
+                        Qcur),
+                    Qcur);
+
+                //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+
+                // note: no bias for Key
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
+                    layer.attn_k_w,
+                    cur);
+
+                //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
+                    layer.attn_v_w,
+                    cur);
+
+                Vcur = ggml_add(ctx0,
+                    ggml_repeat(ctx0,
+                        layer.attn_v_b,
+                        Vcur),
+                    Vcur);
+
+                // ------
+
+                wstate.use_buf(ctx0, 0);
+
+#ifdef WHISPER_USE_FLASH_ATTN
+                struct ggml_tensor * Q =
+                    ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
+                        0, 2, 1, 3);
+
+                struct ggml_tensor * K =
+                    ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Kcur,
+                            ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
+                        0, 2, 1, 3);
+
+                struct ggml_tensor * V =
+                    ggml_cpy(ctx0,
+                        ggml_permute(ctx0,
+                            ggml_reshape_3d(ctx0,
+                                Vcur,
+                                n_state/n_head, n_head, n_ctx),
+                            1, 2, 0, 3),
+                        ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
+
+                struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
+#else
+                struct ggml_tensor * Q =
+                    ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
+                        0, 2, 1, 3);
+
+                struct ggml_tensor * K =
+                    ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Kcur,
+                            ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
+                        0, 2, 1, 3);
+
+                // K * Q
+                struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+
+                struct ggml_tensor * KQ_scaled =
+                    ggml_scale(ctx0,
+                        KQ,
+                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
+                        );
+
+                struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
+
+                //struct ggml_tensor * V_trans =
+                //    ggml_permute(ctx0,
+                //        ggml_cpy(ctx0,
+                //            Vcur,
+                //            ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
+                //        1, 2, 0, 3);
+
+                //struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+
+                struct ggml_tensor * V =
+                    ggml_cpy(ctx0,
+                        ggml_permute(ctx0,
+                            ggml_reshape_3d(ctx0,
+                                Vcur,
+                                n_state/n_head, n_head, n_ctx),
+                            0, 2, 1, 3),
+                        ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head)
+                        );
+
+                struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
+#endif
+                struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+
+                wstate.use_buf(ctx0, 1);
+
+                cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
+            }
+
+            // projection
+            {
+                wstate.use_buf(ctx0, 0);
+
+                cur = ggml_mul_mat(ctx0,
+                    layer.attn_ln_1_w,
+                    cur);
+
+                wstate.use_buf(ctx0, 1);
+
+                cur = ggml_add(ctx0,
+                    ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
+                    cur);
+            }
+
+            wstate.use_buf(ctx0, 2);
+
+            // add the input
+            cur = ggml_add(ctx0, cur, inpL);
+
+            struct ggml_tensor * inpFF = cur;
+
+            // feed-forward network
+            {
+                // norm
+                {
+                    wstate.use_buf(ctx0, 0);
+
+                    cur = ggml_norm(ctx0, inpFF);
+
+                    wstate.use_buf(ctx0, 1);
+
+                    // cur = mlp_ln_w*cur + mlp_ln_b
+                    cur = ggml_add(ctx0,
+                        ggml_mul(ctx0,
+                            ggml_repeat(ctx0, layer.mlp_ln_w, cur),
+                            cur),
+                        ggml_repeat(ctx0, layer.mlp_ln_b, cur));
+                }
+
+#ifdef WHISPER_USE_FLASH_FF
+                wstate.use_buf(ctx0, 0);
+
+                cur = ggml_flash_ff(ctx0,
+                    ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.wtype, n_state, n_ctx)),
+                    layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
+#else
+                wstate.use_buf(ctx0, 0);
+
+                // fully connected
+                cur = ggml_mul_mat(ctx0,
+                    layer.mlp_0_w,
+                    cur);
+
+                wstate.use_buf(ctx0, 1);
+
+                cur = ggml_add(ctx0,
+                    ggml_repeat(ctx0, layer.mlp_0_b, cur),
+                    cur);
+
+                wstate.use_buf(ctx0, 0);
+
+                // GELU activation
+                cur = ggml_gelu(ctx0, cur);
+
+                wstate.use_buf(ctx0, 1);
+
+                // projection
+                cur = ggml_mul_mat(ctx0,
+                    layer.mlp_1_w,
+                    cur);
+
+                wstate.use_buf(ctx0, 0);
+
+                cur = ggml_add(ctx0,
+                    ggml_repeat(ctx0, layer.mlp_1_b, cur),
+                    cur);
+#endif
+            }
+
+            wstate.use_buf(ctx0, 3);
+
+            inpL = ggml_add(ctx0, cur, inpFF);
+        }
+
+        cur = inpL;
+
+        // norm
+        {
+            wstate.use_buf(ctx0, 0);
+
+            cur = ggml_norm(ctx0, cur);
+
+            wstate.use_buf(ctx0, 1);
+
+            // cur = ln_f_g*cur + ln_f_b
+            cur = ggml_add(ctx0,
+                ggml_mul(ctx0,
+                    ggml_repeat(ctx0, model.e_ln_w, cur),
+                    cur),
+                ggml_repeat(ctx0, model.e_ln_b, cur));
+        }
+
+        wstate.use_buf(ctx0, -1);
+
+        // run the computation
+        {
+            struct ggml_cgraph gf = {};
+            gf.n_threads = n_threads;
+
+            ggml_build_forward_expand(&gf, cur);
+            ggml_graph_compute(ctx0, &gf);
+
+            //ggml_graph_print(&gf);
+        }
+    }
+#ifdef WHISPER_USE_COREML
+    else
+    {
+        wstate.use_buf(ctx0, -1);
+
+        cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
+
+        whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
+    }
#endif

    // cur
@@ -2569,10 +2580,12 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
    state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
    if (!state->ctx_coreml) {
        fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+#ifndef WHISPER_COREML_ALLOW_FALLBACK
        return nullptr;
+#endif
+    } else {
+        fprintf(stderr, "%s: Core ML model loaded\n", __func__);
    }
-
-    fprintf(stderr, "%s: Core ML model loaded\n", __func__);
#endif

    state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
@@ -2745,8 +2758,10 @@ void whisper_free_state(struct whisper_state * state)
    }

#ifdef WHISPER_USE_COREML
-    whisper_coreml_free(state->ctx_coreml);
-    state->ctx_coreml = nullptr;
+    if (state->ctx_coreml != nullptr) {
+        whisper_coreml_free(state->ctx_coreml);
+        state->ctx_coreml = nullptr;
+    }
#endif

    delete state;
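
For callers, the visible effect is that a Core ML-enabled build compiled with WHISPER_COREML_ALLOW_FALLBACK no longer fails to initialize when the corresponding Core ML encoder model cannot be loaded; it logs the failure and transcribes with the regular ggml encoder instead. A minimal caller-side sketch follows (the model path is only an example):

#include "whisper.h"
#include <cstdio>

int main() {
    // Same call as before this change; with fallback enabled it succeeds
    // whether or not the matching Core ML encoder model can be loaded.
    struct whisper_context * ctx = whisper_init_from_file("models/ggml-base.en.bin");
    if (ctx == nullptr) {
        std::fprintf(stderr, "failed to initialize whisper context\n");
        return 1;
    }

    // ... run whisper_full() on PCM audio as usual ...

    whisper_free(ctx);
    return 0;
}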