ggerganov commited on
Commit
ca4986c
·
1 Parent(s): 4abd756

wip : WASM 128-bit SIMD support

Browse files
CMakeLists.txt CHANGED
@@ -123,15 +123,15 @@ else()
123
  if (MSVC)
124
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2 /D_CRT_SECURE_NO_WARNINGS=1")
125
  else()
126
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
 
 
 
 
 
127
  endif()
128
  endif()
129
 
130
- if (EMSCRIPTEN)
131
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -msimd128")
132
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
133
- endif()
134
-
135
  # whisper - this is the main library of the project
136
 
137
  set(TARGET whisper)
 
123
  if (MSVC)
124
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2 /D_CRT_SECURE_NO_WARNINGS=1")
125
  else()
126
+ if (EMSCRIPTEN)
127
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -msimd128")
128
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
129
+ else()
130
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
131
+ endif()
132
  endif()
133
  endif()
134
 
 
 
 
 
 
135
  # whisper - this is the main library of the project
136
 
137
  set(TARGET whisper)
bindings/javascript/CMakeLists.txt CHANGED
@@ -21,13 +21,14 @@ if (WHISPER_WASM_SINGLE_FILE)
21
  )
22
  endif()
23
 
 
24
  set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
25
  --bind \
26
  -s MODULARIZE=1 \
27
  -s ASSERTIONS=1 \
28
  -s USE_PTHREADS=1 \
29
- -s PTHREAD_POOL_SIZE=8 \
30
- -s TOTAL_MEMORY=536870912 \
31
  -s FORCE_FILESYSTEM=1 \
32
  -s EXPORT_NAME=\"'whisper_factory'\" \
33
  ${EXTRA_FLAGS} \
 
21
  )
22
  endif()
23
 
24
+ #-s TOTAL_MEMORY=536870912 \
25
  set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
26
  --bind \
27
  -s MODULARIZE=1 \
28
  -s ASSERTIONS=1 \
29
  -s USE_PTHREADS=1 \
30
+ -s PTHREAD_POOL_SIZE=9 \
31
+ -s ALLOW_MEMORY_GROWTH=1 \
32
  -s FORCE_FILESYSTEM=1 \
33
  -s EXPORT_NAME=\"'whisper_factory'\" \
34
  ${EXTRA_FLAGS} \
bindings/javascript/emscripten.cpp CHANGED
@@ -4,6 +4,7 @@
4
  #include <emscripten/bind.h>
5
 
6
  #include <vector>
 
7
 
8
  std::vector<struct whisper_context *> g_contexts(4, nullptr);
9
 
@@ -47,7 +48,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
47
  params.print_special_tokens = false;
48
  params.translate = false;
49
  params.language = "en";
50
- params.n_threads = 4;
51
  params.offset_ms = 0;
52
 
53
  std::vector<float> pcmf32;
 
4
  #include <emscripten/bind.h>
5
 
6
  #include <vector>
7
+ #include <thread>
8
 
9
  std::vector<struct whisper_context *> g_contexts(4, nullptr);
10
 
 
48
  params.print_special_tokens = false;
49
  params.translate = false;
50
  params.language = "en";
51
+ params.n_threads = std::min(8, (int) std::thread::hardware_concurrency());
52
  params.offset_ms = 0;
53
 
54
  std::vector<float> pcmf32;
bindings/javascript/whisper.js CHANGED
The diff for this file is too large to render. See raw diff
 
ggml.c CHANGED
@@ -73,7 +73,11 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
73
 
74
  #else
75
 
 
 
 
76
  #include <immintrin.h>
 
77
 
78
  // FP16 <-> FP32
79
  // ref: https://github.com/Maratyszcza/FP16
@@ -288,7 +292,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
288
  sumf += x[i]*y[i];
289
  }
290
  #elif defined(__AVX2__)
291
- // AVX 256-bit (unroll 4)
292
  const int n32 = (n & ~31);
293
 
294
  __m256 sum0 = _mm256_setzero_ps();
@@ -330,6 +334,45 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
330
  for (int i = n32; i < n; ++i) {
331
  sumf += x[i]*y[i];
332
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  #else
334
  // scalar
335
  for (int i = 0; i < n; ++i) {
@@ -446,7 +489,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
446
  sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
447
  }
448
  #elif defined(__AVX2__)
449
- // AVX 256-bit (unroll 4)
450
  const int n32 = (n & ~31);
451
 
452
  __m256 sum0 = _mm256_setzero_ps();
@@ -489,6 +532,54 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
489
  //GGML_ASSERT(false);
490
  sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
491
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
  #else
493
  for (int i = 0; i < n; ++i) {
494
  sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
@@ -535,7 +626,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
535
  y[i] += x[i]*v;
536
  }
537
  #elif defined(__AVX2__)
538
- // AVX 256-bit (unroll 4)
539
  const int n32 = (n & ~31);
540
 
541
  const __m256 v4 = _mm256_set1_ps(v);
@@ -569,6 +660,41 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
569
  for (int i = n32; i < n; ++i) {
570
  y[i] += x[i]*v;
571
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
  #else
573
  // scalar
574
  for (int i = 0; i < n; ++i) {
@@ -696,6 +822,54 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
696
  GGML_ASSERT(false);
697
  y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
698
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
699
  #else
700
  for (int i = 0; i < n; ++i) {
701
  y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
 
73
 
74
  #else
75
 
76
+ #ifdef __wasm_simd128__
77
+ #include <wasm_simd128.h>
78
+ #else
79
  #include <immintrin.h>
80
+ #endif
81
 
82
  // FP16 <-> FP32
83
  // ref: https://github.com/Maratyszcza/FP16
 
292
  sumf += x[i]*y[i];
293
  }
294
  #elif defined(__AVX2__)
295
+ // AVX 256-bit
296
  const int n32 = (n & ~31);
297
 
298
  __m256 sum0 = _mm256_setzero_ps();
 
334
  for (int i = n32; i < n; ++i) {
335
  sumf += x[i]*y[i];
336
  }
337
+ #elif defined(__wasm_simd128__)
338
+ // WASM 128-bit
339
+ const int n16 = (n & ~15);
340
+
341
+ v128_t sum0 = wasm_f32x4_splat(0);
342
+ v128_t sum1 = wasm_f32x4_splat(0);
343
+ v128_t sum2 = wasm_f32x4_splat(0);
344
+ v128_t sum3 = wasm_f32x4_splat(0);
345
+
346
+ v128_t x0, x1, x2, x3;
347
+ v128_t y0, y1, y2, y3;
348
+
349
+ for (int i = 0; i < n16; i += 16) {
350
+ x0 = wasm_v128_load(x + i + 0);
351
+ x1 = wasm_v128_load(x + i + 4);
352
+ x2 = wasm_v128_load(x + i + 8);
353
+ x3 = wasm_v128_load(x + i + 12);
354
+
355
+ y0 = wasm_v128_load(y + i + 0);
356
+ y1 = wasm_v128_load(y + i + 4);
357
+ y2 = wasm_v128_load(y + i + 8);
358
+ y3 = wasm_v128_load(y + i + 12);
359
+
360
+ sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0));
361
+ sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1));
362
+ sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2));
363
+ sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3));
364
+ }
365
+
366
+ sum0 = wasm_f32x4_add(sum0, sum1);
367
+ sum2 = wasm_f32x4_add(sum2, sum3);
368
+ sum0 = wasm_f32x4_add(sum0, sum2);
369
+
370
+ sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3);
371
+
372
+ // leftovers
373
+ for (int i = n16; i < n; ++i) {
374
+ sumf += x[i]*y[i];
375
+ }
376
  #else
377
  // scalar
378
  for (int i = 0; i < n; ++i) {
 
489
  sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
490
  }
491
  #elif defined(__AVX2__)
492
+ // AVX 256-bit
493
  const int n32 = (n & ~31);
494
 
495
  __m256 sum0 = _mm256_setzero_ps();
 
532
  //GGML_ASSERT(false);
533
  sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
534
  }
535
+ #elif defined(__wasm_simd128__)
536
+ // WASM 128-bit
537
+ const int n16 = (n & ~15);
538
+
539
+ v128_t sum0 = wasm_f32x4_splat(0.0f);
540
+ v128_t sum1 = wasm_f32x4_splat(0.0f);
541
+ v128_t sum2 = wasm_f32x4_splat(0.0f);
542
+ v128_t sum3 = wasm_f32x4_splat(0.0f);
543
+
544
+ v128_t x0, x1, x2, x3;
545
+ v128_t y0, y1, y2, y3;
546
+
547
+ float tx[16];
548
+ float ty[16];
549
+
550
+ for (int i = 0; i < n16; i += 16) {
551
+ for (int k = 0; k < 16; ++k) {
552
+ tx[k] = ggml_fp16_to_fp32(x[i + k]);
553
+ ty[k] = ggml_fp16_to_fp32(y[i + k]);
554
+ }
555
+
556
+ x0 = wasm_v128_load(tx + 0);
557
+ x1 = wasm_v128_load(tx + 4);
558
+ x2 = wasm_v128_load(tx + 8);
559
+ x3 = wasm_v128_load(tx + 12);
560
+
561
+ y0 = wasm_v128_load(ty + 0);
562
+ y1 = wasm_v128_load(ty + 4);
563
+ y2 = wasm_v128_load(ty + 8);
564
+ y3 = wasm_v128_load(ty + 12);
565
+
566
+ sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0));
567
+ sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1));
568
+ sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2));
569
+ sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3));
570
+ }
571
+
572
+ sum0 = wasm_f32x4_add(sum0, sum1);
573
+ sum2 = wasm_f32x4_add(sum2, sum3);
574
+ sum0 = wasm_f32x4_add(sum0, sum2);
575
+
576
+ sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3);
577
+
578
+ // leftovers
579
+ for (int i = n16; i < n; ++i) {
580
+ //GGML_ASSERT(false);
581
+ sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
582
+ }
583
  #else
584
  for (int i = 0; i < n; ++i) {
585
  sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
 
626
  y[i] += x[i]*v;
627
  }
628
  #elif defined(__AVX2__)
629
+ // AVX 256-bit
630
  const int n32 = (n & ~31);
631
 
632
  const __m256 v4 = _mm256_set1_ps(v);
 
660
  for (int i = n32; i < n; ++i) {
661
  y[i] += x[i]*v;
662
  }
663
+ #elif defined(__wasm_simd128__)
664
+ // WASM SIMD 128-bit
665
+ const int n16 = (n & ~15);
666
+
667
+ const v128_t v4 = wasm_f32x4_splat(v);
668
+
669
+ v128_t x0, x1, x2, x3;
670
+ v128_t y0, y1, y2, y3;
671
+
672
+ for (int i = 0; i < n16; i += 16) {
673
+ x0 = wasm_v128_load(x + i + 0);
674
+ x1 = wasm_v128_load(x + i + 4);
675
+ x2 = wasm_v128_load(x + i + 8);
676
+ x3 = wasm_v128_load(x + i + 12);
677
+
678
+ y0 = wasm_v128_load(y + i + 0);
679
+ y1 = wasm_v128_load(y + i + 4);
680
+ y2 = wasm_v128_load(y + i + 8);
681
+ y3 = wasm_v128_load(y + i + 12);
682
+
683
+ y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
684
+ y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
685
+ y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
686
+ y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));
687
+
688
+ wasm_v128_store(y + i + 0, y0);
689
+ wasm_v128_store(y + i + 4, y1);
690
+ wasm_v128_store(y + i + 8, y2);
691
+ wasm_v128_store(y + i + 12, y3);
692
+ }
693
+
694
+ // leftovers
695
+ for (int i = n16; i < n; ++i) {
696
+ y[i] += x[i]*v;
697
+ }
698
  #else
699
  // scalar
700
  for (int i = 0; i < n; ++i) {
 
822
  GGML_ASSERT(false);
823
  y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
824
  }
825
+ #elif defined(__wasm_simd128__)
826
+ // WASM SIMD 128-bit
827
+ const int n16 = (n & ~15);
828
+
829
+ const v128_t v4 = wasm_f32x4_splat(v);
830
+
831
+ v128_t x0, x1, x2, x3;
832
+ v128_t y0, y1, y2, y3;
833
+
834
+ float tx[16];
835
+ float ty[16];
836
+
837
+ for (int i = 0; i < n16; i += 16) {
838
+ for (int k = 0; k < 16; ++k) {
839
+ tx[k] = ggml_fp16_to_fp32(x[i + k]);
840
+ ty[k] = ggml_fp16_to_fp32(y[i + k]);
841
+ }
842
+
843
+ x0 = wasm_v128_load(tx + 0);
844
+ x1 = wasm_v128_load(tx + 4);
845
+ x2 = wasm_v128_load(tx + 8);
846
+ x3 = wasm_v128_load(tx + 12);
847
+
848
+ y0 = wasm_v128_load(ty + 0);
849
+ y1 = wasm_v128_load(ty + 4);
850
+ y2 = wasm_v128_load(ty + 8);
851
+ y3 = wasm_v128_load(ty + 12);
852
+
853
+ y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
854
+ y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
855
+ y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
856
+ y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));
857
+
858
+ wasm_v128_store(ty + 0, y0);
859
+ wasm_v128_store(ty + 4, y1);
860
+ wasm_v128_store(ty + 8, y2);
861
+ wasm_v128_store(ty + 12, y3);
862
+
863
+ for (int k = 0; k < 16; ++k) {
864
+ y[i + k] = ggml_fp32_to_fp16(ty[k]);
865
+ }
866
+ }
867
+
868
+ // leftovers
869
+ for (int i = n16; i < n; ++i) {
870
+ GGML_ASSERT(false);
871
+ y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
872
+ }
873
  #else
874
  for (int i = 0; i < n; ++i) {
875
  y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);