Spaces:
Running
Running
wip : WASM 128-bit SIMD support
Browse files- CMakeLists.txt +6 -6
- bindings/javascript/CMakeLists.txt +3 -2
- bindings/javascript/emscripten.cpp +2 -1
- bindings/javascript/whisper.js +0 -0
- ggml.c +177 -3
CMakeLists.txt
CHANGED
|
@@ -123,15 +123,15 @@ else()
|
|
| 123 |
if (MSVC)
|
| 124 |
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2 /D_CRT_SECURE_NO_WARNINGS=1")
|
| 125 |
else()
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
endif()
|
| 128 |
endif()
|
| 129 |
|
| 130 |
-
if (EMSCRIPTEN)
|
| 131 |
-
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -msimd128")
|
| 132 |
-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
|
| 133 |
-
endif()
|
| 134 |
-
|
| 135 |
# whisper - this is the main library of the project
|
| 136 |
|
| 137 |
set(TARGET whisper)
|
|
|
|
| 123 |
if (MSVC)
|
| 124 |
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2 /D_CRT_SECURE_NO_WARNINGS=1")
|
| 125 |
else()
|
| 126 |
+
if (EMSCRIPTEN)
|
| 127 |
+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -msimd128")
|
| 128 |
+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
|
| 129 |
+
else()
|
| 130 |
+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
|
| 131 |
+
endif()
|
| 132 |
endif()
|
| 133 |
endif()
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
# whisper - this is the main library of the project
|
| 136 |
|
| 137 |
set(TARGET whisper)
|
bindings/javascript/CMakeLists.txt
CHANGED
|
@@ -21,13 +21,14 @@ if (WHISPER_WASM_SINGLE_FILE)
|
|
| 21 |
)
|
| 22 |
endif()
|
| 23 |
|
|
|
|
| 24 |
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
| 25 |
--bind \
|
| 26 |
-s MODULARIZE=1 \
|
| 27 |
-s ASSERTIONS=1 \
|
| 28 |
-s USE_PTHREADS=1 \
|
| 29 |
-
-s PTHREAD_POOL_SIZE=
|
| 30 |
-
-s
|
| 31 |
-s FORCE_FILESYSTEM=1 \
|
| 32 |
-s EXPORT_NAME=\"'whisper_factory'\" \
|
| 33 |
${EXTRA_FLAGS} \
|
|
|
|
| 21 |
)
|
| 22 |
endif()
|
| 23 |
|
| 24 |
+
#-s TOTAL_MEMORY=536870912 \
|
| 25 |
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
| 26 |
--bind \
|
| 27 |
-s MODULARIZE=1 \
|
| 28 |
-s ASSERTIONS=1 \
|
| 29 |
-s USE_PTHREADS=1 \
|
| 30 |
+
-s PTHREAD_POOL_SIZE=9 \
|
| 31 |
+
-s ALLOW_MEMORY_GROWTH=1 \
|
| 32 |
-s FORCE_FILESYSTEM=1 \
|
| 33 |
-s EXPORT_NAME=\"'whisper_factory'\" \
|
| 34 |
${EXTRA_FLAGS} \
|
bindings/javascript/emscripten.cpp
CHANGED
|
@@ -4,6 +4,7 @@
|
|
| 4 |
#include <emscripten/bind.h>
|
| 5 |
|
| 6 |
#include <vector>
|
|
|
|
| 7 |
|
| 8 |
std::vector<struct whisper_context *> g_contexts(4, nullptr);
|
| 9 |
|
|
@@ -47,7 +48,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
|
|
| 47 |
params.print_special_tokens = false;
|
| 48 |
params.translate = false;
|
| 49 |
params.language = "en";
|
| 50 |
-
params.n_threads =
|
| 51 |
params.offset_ms = 0;
|
| 52 |
|
| 53 |
std::vector<float> pcmf32;
|
|
|
|
| 4 |
#include <emscripten/bind.h>
|
| 5 |
|
| 6 |
#include <vector>
|
| 7 |
+
#include <thread>
|
| 8 |
|
| 9 |
std::vector<struct whisper_context *> g_contexts(4, nullptr);
|
| 10 |
|
|
|
|
| 48 |
params.print_special_tokens = false;
|
| 49 |
params.translate = false;
|
| 50 |
params.language = "en";
|
| 51 |
+
params.n_threads = std::min(8, (int) std::thread::hardware_concurrency());
|
| 52 |
params.offset_ms = 0;
|
| 53 |
|
| 54 |
std::vector<float> pcmf32;
|
bindings/javascript/whisper.js
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ggml.c
CHANGED
|
@@ -73,7 +73,11 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
|
| 73 |
|
| 74 |
#else
|
| 75 |
|
|
|
|
|
|
|
|
|
|
| 76 |
#include <immintrin.h>
|
|
|
|
| 77 |
|
| 78 |
// FP16 <-> FP32
|
| 79 |
// ref: https://github.com/Maratyszcza/FP16
|
|
@@ -288,7 +292,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
|
|
| 288 |
sumf += x[i]*y[i];
|
| 289 |
}
|
| 290 |
#elif defined(__AVX2__)
|
| 291 |
-
// AVX 256-bit
|
| 292 |
const int n32 = (n & ~31);
|
| 293 |
|
| 294 |
__m256 sum0 = _mm256_setzero_ps();
|
|
@@ -330,6 +334,45 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
|
|
| 330 |
for (int i = n32; i < n; ++i) {
|
| 331 |
sumf += x[i]*y[i];
|
| 332 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
#else
|
| 334 |
// scalar
|
| 335 |
for (int i = 0; i < n; ++i) {
|
|
@@ -446,7 +489,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|
| 446 |
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
|
| 447 |
}
|
| 448 |
#elif defined(__AVX2__)
|
| 449 |
-
// AVX 256-bit
|
| 450 |
const int n32 = (n & ~31);
|
| 451 |
|
| 452 |
__m256 sum0 = _mm256_setzero_ps();
|
|
@@ -489,6 +532,54 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|
| 489 |
//GGML_ASSERT(false);
|
| 490 |
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
|
| 491 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
#else
|
| 493 |
for (int i = 0; i < n; ++i) {
|
| 494 |
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
|
|
@@ -535,7 +626,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
|
|
| 535 |
y[i] += x[i]*v;
|
| 536 |
}
|
| 537 |
#elif defined(__AVX2__)
|
| 538 |
-
// AVX 256-bit
|
| 539 |
const int n32 = (n & ~31);
|
| 540 |
|
| 541 |
const __m256 v4 = _mm256_set1_ps(v);
|
|
@@ -569,6 +660,41 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
|
|
| 569 |
for (int i = n32; i < n; ++i) {
|
| 570 |
y[i] += x[i]*v;
|
| 571 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
#else
|
| 573 |
// scalar
|
| 574 |
for (int i = 0; i < n; ++i) {
|
|
@@ -696,6 +822,54 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
|
|
| 696 |
GGML_ASSERT(false);
|
| 697 |
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
|
| 698 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 699 |
#else
|
| 700 |
for (int i = 0; i < n; ++i) {
|
| 701 |
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
|
|
|
|
| 73 |
|
| 74 |
#else
|
| 75 |
|
| 76 |
+
#ifdef __wasm_simd128__
|
| 77 |
+
#include <wasm_simd128.h>
|
| 78 |
+
#else
|
| 79 |
#include <immintrin.h>
|
| 80 |
+
#endif
|
| 81 |
|
| 82 |
// FP16 <-> FP32
|
| 83 |
// ref: https://github.com/Maratyszcza/FP16
|
|
|
|
| 292 |
sumf += x[i]*y[i];
|
| 293 |
}
|
| 294 |
#elif defined(__AVX2__)
|
| 295 |
+
// AVX 256-bit
|
| 296 |
const int n32 = (n & ~31);
|
| 297 |
|
| 298 |
__m256 sum0 = _mm256_setzero_ps();
|
|
|
|
| 334 |
for (int i = n32; i < n; ++i) {
|
| 335 |
sumf += x[i]*y[i];
|
| 336 |
}
|
| 337 |
+
#elif defined(__wasm_simd128__)
|
| 338 |
+
// WASM 128-bit
|
| 339 |
+
const int n16 = (n & ~15);
|
| 340 |
+
|
| 341 |
+
v128_t sum0 = wasm_f32x4_splat(0);
|
| 342 |
+
v128_t sum1 = wasm_f32x4_splat(0);
|
| 343 |
+
v128_t sum2 = wasm_f32x4_splat(0);
|
| 344 |
+
v128_t sum3 = wasm_f32x4_splat(0);
|
| 345 |
+
|
| 346 |
+
v128_t x0, x1, x2, x3;
|
| 347 |
+
v128_t y0, y1, y2, y3;
|
| 348 |
+
|
| 349 |
+
for (int i = 0; i < n16; i += 16) {
|
| 350 |
+
x0 = wasm_v128_load(x + i + 0);
|
| 351 |
+
x1 = wasm_v128_load(x + i + 4);
|
| 352 |
+
x2 = wasm_v128_load(x + i + 8);
|
| 353 |
+
x3 = wasm_v128_load(x + i + 12);
|
| 354 |
+
|
| 355 |
+
y0 = wasm_v128_load(y + i + 0);
|
| 356 |
+
y1 = wasm_v128_load(y + i + 4);
|
| 357 |
+
y2 = wasm_v128_load(y + i + 8);
|
| 358 |
+
y3 = wasm_v128_load(y + i + 12);
|
| 359 |
+
|
| 360 |
+
sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0));
|
| 361 |
+
sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1));
|
| 362 |
+
sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2));
|
| 363 |
+
sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3));
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
sum0 = wasm_f32x4_add(sum0, sum1);
|
| 367 |
+
sum2 = wasm_f32x4_add(sum2, sum3);
|
| 368 |
+
sum0 = wasm_f32x4_add(sum0, sum2);
|
| 369 |
+
|
| 370 |
+
sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3);
|
| 371 |
+
|
| 372 |
+
// leftovers
|
| 373 |
+
for (int i = n16; i < n; ++i) {
|
| 374 |
+
sumf += x[i]*y[i];
|
| 375 |
+
}
|
| 376 |
#else
|
| 377 |
// scalar
|
| 378 |
for (int i = 0; i < n; ++i) {
|
|
|
|
| 489 |
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
|
| 490 |
}
|
| 491 |
#elif defined(__AVX2__)
|
| 492 |
+
// AVX 256-bit
|
| 493 |
const int n32 = (n & ~31);
|
| 494 |
|
| 495 |
__m256 sum0 = _mm256_setzero_ps();
|
|
|
|
| 532 |
//GGML_ASSERT(false);
|
| 533 |
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
|
| 534 |
}
|
| 535 |
+
#elif defined(__wasm_simd128__)
|
| 536 |
+
// WASM 128-bit
|
| 537 |
+
const int n16 = (n & ~15);
|
| 538 |
+
|
| 539 |
+
v128_t sum0 = wasm_f32x4_splat(0.0f);
|
| 540 |
+
v128_t sum1 = wasm_f32x4_splat(0.0f);
|
| 541 |
+
v128_t sum2 = wasm_f32x4_splat(0.0f);
|
| 542 |
+
v128_t sum3 = wasm_f32x4_splat(0.0f);
|
| 543 |
+
|
| 544 |
+
v128_t x0, x1, x2, x3;
|
| 545 |
+
v128_t y0, y1, y2, y3;
|
| 546 |
+
|
| 547 |
+
float tx[16];
|
| 548 |
+
float ty[16];
|
| 549 |
+
|
| 550 |
+
for (int i = 0; i < n16; i += 16) {
|
| 551 |
+
for (int k = 0; k < 16; ++k) {
|
| 552 |
+
tx[k] = ggml_fp16_to_fp32(x[i + k]);
|
| 553 |
+
ty[k] = ggml_fp16_to_fp32(y[i + k]);
|
| 554 |
+
}
|
| 555 |
+
|
| 556 |
+
x0 = wasm_v128_load(tx + 0);
|
| 557 |
+
x1 = wasm_v128_load(tx + 4);
|
| 558 |
+
x2 = wasm_v128_load(tx + 8);
|
| 559 |
+
x3 = wasm_v128_load(tx + 12);
|
| 560 |
+
|
| 561 |
+
y0 = wasm_v128_load(ty + 0);
|
| 562 |
+
y1 = wasm_v128_load(ty + 4);
|
| 563 |
+
y2 = wasm_v128_load(ty + 8);
|
| 564 |
+
y3 = wasm_v128_load(ty + 12);
|
| 565 |
+
|
| 566 |
+
sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0));
|
| 567 |
+
sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1));
|
| 568 |
+
sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2));
|
| 569 |
+
sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3));
|
| 570 |
+
}
|
| 571 |
+
|
| 572 |
+
sum0 = wasm_f32x4_add(sum0, sum1);
|
| 573 |
+
sum2 = wasm_f32x4_add(sum2, sum3);
|
| 574 |
+
sum0 = wasm_f32x4_add(sum0, sum2);
|
| 575 |
+
|
| 576 |
+
sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3);
|
| 577 |
+
|
| 578 |
+
// leftovers
|
| 579 |
+
for (int i = n16; i < n; ++i) {
|
| 580 |
+
//GGML_ASSERT(false);
|
| 581 |
+
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
|
| 582 |
+
}
|
| 583 |
#else
|
| 584 |
for (int i = 0; i < n; ++i) {
|
| 585 |
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
|
|
|
|
| 626 |
y[i] += x[i]*v;
|
| 627 |
}
|
| 628 |
#elif defined(__AVX2__)
|
| 629 |
+
// AVX 256-bit
|
| 630 |
const int n32 = (n & ~31);
|
| 631 |
|
| 632 |
const __m256 v4 = _mm256_set1_ps(v);
|
|
|
|
| 660 |
for (int i = n32; i < n; ++i) {
|
| 661 |
y[i] += x[i]*v;
|
| 662 |
}
|
| 663 |
+
#elif defined(__wasm_simd128__)
|
| 664 |
+
// WASM SIMD 128-bit
|
| 665 |
+
const int n16 = (n & ~15);
|
| 666 |
+
|
| 667 |
+
const v128_t v4 = wasm_f32x4_splat(v);
|
| 668 |
+
|
| 669 |
+
v128_t x0, x1, x2, x3;
|
| 670 |
+
v128_t y0, y1, y2, y3;
|
| 671 |
+
|
| 672 |
+
for (int i = 0; i < n16; i += 16) {
|
| 673 |
+
x0 = wasm_v128_load(x + i + 0);
|
| 674 |
+
x1 = wasm_v128_load(x + i + 4);
|
| 675 |
+
x2 = wasm_v128_load(x + i + 8);
|
| 676 |
+
x3 = wasm_v128_load(x + i + 12);
|
| 677 |
+
|
| 678 |
+
y0 = wasm_v128_load(y + i + 0);
|
| 679 |
+
y1 = wasm_v128_load(y + i + 4);
|
| 680 |
+
y2 = wasm_v128_load(y + i + 8);
|
| 681 |
+
y3 = wasm_v128_load(y + i + 12);
|
| 682 |
+
|
| 683 |
+
y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
|
| 684 |
+
y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
|
| 685 |
+
y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
|
| 686 |
+
y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));
|
| 687 |
+
|
| 688 |
+
wasm_v128_store(y + i + 0, y0);
|
| 689 |
+
wasm_v128_store(y + i + 4, y1);
|
| 690 |
+
wasm_v128_store(y + i + 8, y2);
|
| 691 |
+
wasm_v128_store(y + i + 12, y3);
|
| 692 |
+
}
|
| 693 |
+
|
| 694 |
+
// leftovers
|
| 695 |
+
for (int i = n16; i < n; ++i) {
|
| 696 |
+
y[i] += x[i]*v;
|
| 697 |
+
}
|
| 698 |
#else
|
| 699 |
// scalar
|
| 700 |
for (int i = 0; i < n; ++i) {
|
|
|
|
| 822 |
GGML_ASSERT(false);
|
| 823 |
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
|
| 824 |
}
|
| 825 |
+
#elif defined(__wasm_simd128__)
|
| 826 |
+
// WASM SIMD 128-bit
|
| 827 |
+
const int n16 = (n & ~15);
|
| 828 |
+
|
| 829 |
+
const v128_t v4 = wasm_f32x4_splat(v);
|
| 830 |
+
|
| 831 |
+
v128_t x0, x1, x2, x3;
|
| 832 |
+
v128_t y0, y1, y2, y3;
|
| 833 |
+
|
| 834 |
+
float tx[16];
|
| 835 |
+
float ty[16];
|
| 836 |
+
|
| 837 |
+
for (int i = 0; i < n16; i += 16) {
|
| 838 |
+
for (int k = 0; k < 16; ++k) {
|
| 839 |
+
tx[k] = ggml_fp16_to_fp32(x[i + k]);
|
| 840 |
+
ty[k] = ggml_fp16_to_fp32(y[i + k]);
|
| 841 |
+
}
|
| 842 |
+
|
| 843 |
+
x0 = wasm_v128_load(tx + 0);
|
| 844 |
+
x1 = wasm_v128_load(tx + 4);
|
| 845 |
+
x2 = wasm_v128_load(tx + 8);
|
| 846 |
+
x3 = wasm_v128_load(tx + 12);
|
| 847 |
+
|
| 848 |
+
y0 = wasm_v128_load(ty + 0);
|
| 849 |
+
y1 = wasm_v128_load(ty + 4);
|
| 850 |
+
y2 = wasm_v128_load(ty + 8);
|
| 851 |
+
y3 = wasm_v128_load(ty + 12);
|
| 852 |
+
|
| 853 |
+
y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
|
| 854 |
+
y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
|
| 855 |
+
y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
|
| 856 |
+
y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));
|
| 857 |
+
|
| 858 |
+
wasm_v128_store(ty + 0, y0);
|
| 859 |
+
wasm_v128_store(ty + 4, y1);
|
| 860 |
+
wasm_v128_store(ty + 8, y2);
|
| 861 |
+
wasm_v128_store(ty + 12, y3);
|
| 862 |
+
|
| 863 |
+
for (int k = 0; k < 16; ++k) {
|
| 864 |
+
y[i + k] = ggml_fp32_to_fp16(ty[k]);
|
| 865 |
+
}
|
| 866 |
+
}
|
| 867 |
+
|
| 868 |
+
// leftovers
|
| 869 |
+
for (int i = n16; i < n; ++i) {
|
| 870 |
+
GGML_ASSERT(false);
|
| 871 |
+
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
|
| 872 |
+
}
|
| 873 |
#else
|
| 874 |
for (int i = 0; i < n; ++i) {
|
| 875 |
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
|