ggerganov committed
Commit 6fe850c · unverified · 1 Parent(s): c92d0b9

ggml : simplify the SIMD code (#324)


* ggml : simplify the SIMD code

* ggml : generic reduce for all register sizes + comments

Files changed (3)
  1. ggml.c +489 -759
  2. ggml.h +1 -0
  3. whisper.cpp +1 -0
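
Summary of the change: instead of a hand-written intrinsics block per architecture (NEON, AVX2, AVX, WASM SIMD, POWER9) inside every vector routine, ggml.c now defines one small set of GGML_F32_* / GGML_F16_* macros per architecture (register type, load/store, FMA, add, mul, reduce, plus GGML_F32_STEP elements per step and GGML_F32_EPR elements per register), and the vector routines are written once against those macros. As a reading aid, here is the new ggml_vec_dot_f32 assembled from the added lines in the diff below; the scalar #else body and the final store to *s are unchanged code outside the shown hunks, so treat those two parts as an assumption:

    inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
        ggml_float sumf = 0.0;

    #ifdef GGML_SIMD
        // largest multiple of GGML_F32_STEP that fits in n
        const int np = (n & ~(GGML_F32_STEP - 1));

        // GGML_F32_ARR = GGML_F32_STEP/GGML_F32_EPR accumulator registers per step
        GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };

        GGML_F32_VEC ax[GGML_F32_ARR];
        GGML_F32_VEC ay[GGML_F32_ARR];

        for (int i = 0; i < np; i += GGML_F32_STEP) {
            for (int j = 0; j < GGML_F32_ARR; j++) {
                ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

                sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
            }
        }

        // fold the GGML_F32_ARR accumulators into a single scalar
        GGML_F32_VEC_REDUCE(sumf, sum);

        // leftovers
        for (int i = np; i < n; ++i) {
            sumf += x[i]*y[i];
        }
    #else
        // scalar fallback (assumed unchanged; not part of the shown hunks)
        for (int i = 0; i < n; ++i) {
            sumf += x[i]*y[i];
        }
    #endif

        *s = sumf;
    }
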
ggml.c CHANGED
@@ -316,192 +316,426 @@ int64_t ggml_cycles_per_ms(void) {
316
  static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
317
 
318
  //
319
- // fundamental operations
320
  //
321
 
322
- inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
323
-
324
- inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
325
-
326
- inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
327
-
328
- inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
329
-
330
- inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
331
- inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
332
- inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
333
- inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
334
- inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
335
- inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
336
- inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
337
- inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
338
- inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
339
 
340
- inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
341
- ggml_float sumf = 0.0;
342
  #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
343
- // NEON 128-bit
344
- const int n16 = (n & ~15);
345
-
346
- float32x4_t sum0 = vdupq_n_f32(0);
347
- float32x4_t sum1 = vdupq_n_f32(0);
348
- float32x4_t sum2 = vdupq_n_f32(0);
349
- float32x4_t sum3 = vdupq_n_f32(0);
350
 
351
- float32x4_t x0, x1, x2, x3;
352
- float32x4_t y0, y1, y2, y3;
353
 
354
- for (int i = 0; i < n16; i += 16) {
355
- x0 = vld1q_f32(x + i + 0);
356
- x1 = vld1q_f32(x + i + 4);
357
- x2 = vld1q_f32(x + i + 8);
358
- x3 = vld1q_f32(x + i + 12);
359
 
360
- y0 = vld1q_f32(y + i + 0);
361
- y1 = vld1q_f32(y + i + 4);
362
- y2 = vld1q_f32(y + i + 8);
363
- y3 = vld1q_f32(y + i + 12);
364
-
365
- sum0 = vfmaq_f32(sum0, x0, y0);
366
- sum1 = vfmaq_f32(sum1, x1, y1);
367
- sum2 = vfmaq_f32(sum2, x2, y2);
368
- sum3 = vfmaq_f32(sum3, x3, y3);
369
- }
370
 
371
- // reduce sum0..sum3 to sum0
372
- sum0 = vaddq_f32(sum0, sum1);
373
- sum2 = vaddq_f32(sum2, sum3);
374
- sum0 = vaddq_f32(sum0, sum2);
375
-
376
- sumf = vaddvq_f32(sum0);
377
 
378
- // leftovers
379
- for (int i = n16; i < n; ++i) {
380
- sumf += x[i]*y[i];
381
- }
382
- #elif defined(__AVX2__)
383
- // AVX 256-bit
384
- const int n32 = (n & ~31);
385
 
386
- __m256 sum0 = _mm256_setzero_ps();
387
- __m256 sum1 = _mm256_setzero_ps();
388
- __m256 sum2 = _mm256_setzero_ps();
389
- __m256 sum3 = _mm256_setzero_ps();
390
 
391
- __m256 x0, x1, x2, x3;
392
- __m256 y0, y1, y2, y3;
393
 
394
- for (int i = 0; i < n32; i += 32) {
395
- x0 = _mm256_loadu_ps(x + i + 0);
396
- x1 = _mm256_loadu_ps(x + i + 8);
397
- x2 = _mm256_loadu_ps(x + i + 16);
398
- x3 = _mm256_loadu_ps(x + i + 24);
399
 
400
- y0 = _mm256_loadu_ps(y + i + 0);
401
- y1 = _mm256_loadu_ps(y + i + 8);
402
- y2 = _mm256_loadu_ps(y + i + 16);
403
- y3 = _mm256_loadu_ps(y + i + 24);
404
 
405
- sum0 = _mm256_fmadd_ps(x0, y0, sum0);
406
- sum1 = _mm256_fmadd_ps(x1, y1, sum1);
407
- sum2 = _mm256_fmadd_ps(x2, y2, sum2);
408
- sum3 = _mm256_fmadd_ps(x3, y3, sum3);
409
- }
410
 
411
- sum0 = _mm256_add_ps(sum0, sum1);
412
- sum2 = _mm256_add_ps(sum2, sum3);
413
- sum0 = _mm256_add_ps(sum0, sum2);
414
 
415
- const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0), _mm256_extractf128_ps(sum0, 1));
416
- const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
417
- const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
418
 
419
- sumf = _mm_cvtss_f32(r1);
420
 
421
- // leftovers
422
- for (int i = n32; i < n; ++i) {
423
- sumf += x[i]*y[i];
424
- }
425
- #elif defined(__AVX__)
426
- // AVX 256-bit
427
- const int n32 = (n & ~31);
428
 
429
- __m256 sum0 = _mm256_setzero_ps();
430
- __m256 sum1 = _mm256_setzero_ps();
431
- __m256 sum2 = _mm256_setzero_ps();
432
- __m256 sum3 = _mm256_setzero_ps();
433
 
434
- __m256 x0, x1, x2, x3;
435
- __m256 y0, y1, y2, y3;
 
 
 
 
436
 
437
- for (int i = 0; i < n32; i += 32) {
438
- x0 = _mm256_loadu_ps(x + i + 0);
439
- x1 = _mm256_loadu_ps(x + i + 8);
440
- x2 = _mm256_loadu_ps(x + i + 16);
441
- x3 = _mm256_loadu_ps(x + i + 24);
442
 
443
- y0 = _mm256_loadu_ps(y + i + 0);
444
- y1 = _mm256_loadu_ps(y + i + 8);
445
- y2 = _mm256_loadu_ps(y + i + 16);
446
- y3 = _mm256_loadu_ps(y + i + 24);
447
 
448
- sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0);
449
- sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1);
450
- sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2);
451
- sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3);
452
- }
453
 
454
- sum0 = _mm256_add_ps(sum0, sum1);
455
- sum2 = _mm256_add_ps(sum2, sum3);
456
- sum0 = _mm256_add_ps(sum0, sum2);
457
 
458
- const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0), _mm256_extractf128_ps(sum0, 1));
459
- const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
460
- const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
461
 
462
- sumf = _mm_cvtss_f32(r1);
463
 
464
- // leftovers
465
- for (int i = n32; i < n; ++i) {
466
- sumf += x[i]*y[i];
467
- }
468
- #elif defined(__wasm_simd128__)
469
- // WASM 128-bit
470
- const int n16 = (n & ~15);
471
 
472
- v128_t sum0 = wasm_f32x4_splat(0);
473
- v128_t sum1 = wasm_f32x4_splat(0);
474
- v128_t sum2 = wasm_f32x4_splat(0);
475
- v128_t sum3 = wasm_f32x4_splat(0);
476
 
477
- v128_t x0, x1, x2, x3;
478
- v128_t y0, y1, y2, y3;
479
 
480
- for (int i = 0; i < n16; i += 16) {
481
- x0 = wasm_v128_load(x + i + 0);
482
- x1 = wasm_v128_load(x + i + 4);
483
- x2 = wasm_v128_load(x + i + 8);
484
- x3 = wasm_v128_load(x + i + 12);
485
 
486
- y0 = wasm_v128_load(y + i + 0);
487
- y1 = wasm_v128_load(y + i + 4);
488
- y2 = wasm_v128_load(y + i + 8);
489
- y3 = wasm_v128_load(y + i + 12);
490
 
491
- sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0));
492
- sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1));
493
- sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2));
494
- sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3));
495
  }
496
 
497
- sum0 = wasm_f32x4_add(sum0, sum1);
498
- sum2 = wasm_f32x4_add(sum2, sum3);
499
- sum0 = wasm_f32x4_add(sum0, sum2);
500
-
501
- sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3);
502
 
503
  // leftovers
504
- for (int i = n16; i < n; ++i) {
505
  sumf += x[i]*y[i];
506
  }
507
  #else
@@ -516,194 +750,34 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
516
 
517
  inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
518
  ggml_float sumf = 0.0;
519
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
520
- const int n32 = (n & ~31);
521
 
522
- #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
523
- float16x8_t sum0 = vdupq_n_f16(0);
524
- float16x8_t sum1 = vdupq_n_f16(0);
525
- float16x8_t sum2 = vdupq_n_f16(0);
526
- float16x8_t sum3 = vdupq_n_f16(0);
527
 
528
- float16x8_t x0, x1, x2, x3;
529
- float16x8_t y0, y1, y2, y3;
530
 
531
- for (int i = 0; i < n32; i += 32) {
532
- x0 = vld1q_f16(x + i + 0 );
533
- x1 = vld1q_f16(x + i + 8 );
534
- x2 = vld1q_f16(x + i + 16);
535
- x3 = vld1q_f16(x + i + 24);
536
 
537
- y0 = vld1q_f16(y + i + 0 );
538
- y1 = vld1q_f16(y + i + 8 );
539
- y2 = vld1q_f16(y + i + 16);
540
- y3 = vld1q_f16(y + i + 24);
541
 
542
- sum0 = vfmaq_f16(sum0, x0, y0);
543
- sum1 = vfmaq_f16(sum1, x1, y1);
544
- sum2 = vfmaq_f16(sum2, x2, y2);
545
- sum3 = vfmaq_f16(sum3, x3, y3);
546
  }
547
 
548
  // reduce sum0..sum3 to sum0
549
- sum0 = vaddq_f16(sum0, sum1);
550
- sum2 = vaddq_f16(sum2, sum3);
551
- sum0 = vaddq_f16(sum0, sum2);
552
-
553
- // load sum0 into 2 float32x4_t
554
- float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0));
555
- float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0));
556
-
557
- // reduce sum0f32 and sum1f32 to sumf
558
- sum0f32 = vaddq_f32(sum0f32, sum1f32);
559
- sumf = vaddvq_f32(sum0f32);
560
- #else
561
- float32x4_t sum0 = vdupq_n_f32(0);
562
- float32x4_t sum1 = vdupq_n_f32(0);
563
- float32x4_t sum2 = vdupq_n_f32(0);
564
- float32x4_t sum3 = vdupq_n_f32(0);
565
- float32x4_t sum4 = vdupq_n_f32(0);
566
- float32x4_t sum5 = vdupq_n_f32(0);
567
- float32x4_t sum6 = vdupq_n_f32(0);
568
- float32x4_t sum7 = vdupq_n_f32(0);
569
-
570
- float32x4_t x0, x1, x2, x3, x4, x5, x6, x7;
571
- float32x4_t y0, y1, y2, y3, y4, y5, y6, y7;
572
-
573
- for (int i = 0; i < n32; i += 32) {
574
- x0 = vcvt_f32_f16(vld1_f16(x + i + 0 ));
575
- x1 = vcvt_f32_f16(vld1_f16(x + i + 4 ));
576
- x2 = vcvt_f32_f16(vld1_f16(x + i + 8 ));
577
- x3 = vcvt_f32_f16(vld1_f16(x + i + 12));
578
- x4 = vcvt_f32_f16(vld1_f16(x + i + 16));
579
- x5 = vcvt_f32_f16(vld1_f16(x + i + 20));
580
- x6 = vcvt_f32_f16(vld1_f16(x + i + 24));
581
- x7 = vcvt_f32_f16(vld1_f16(x + i + 28));
582
-
583
- y0 = vcvt_f32_f16(vld1_f16(y + i + 0 ));
584
- y1 = vcvt_f32_f16(vld1_f16(y + i + 4 ));
585
- y2 = vcvt_f32_f16(vld1_f16(y + i + 8 ));
586
- y3 = vcvt_f32_f16(vld1_f16(y + i + 12));
587
- y4 = vcvt_f32_f16(vld1_f16(y + i + 16));
588
- y5 = vcvt_f32_f16(vld1_f16(y + i + 20));
589
- y6 = vcvt_f32_f16(vld1_f16(y + i + 24));
590
- y7 = vcvt_f32_f16(vld1_f16(y + i + 28));
591
-
592
- sum0 = vfmaq_f32(sum0, x0, y0);
593
- sum1 = vfmaq_f32(sum1, x1, y1);
594
- sum2 = vfmaq_f32(sum2, x2, y2);
595
- sum3 = vfmaq_f32(sum3, x3, y3);
596
- sum4 = vfmaq_f32(sum4, x4, y4);
597
- sum5 = vfmaq_f32(sum5, x5, y5);
598
- sum6 = vfmaq_f32(sum6, x6, y6);
599
- sum7 = vfmaq_f32(sum7, x7, y7);
600
- }
601
-
602
- // reduce sum0..sum7 to sum0
603
- sum0 = vaddq_f32(sum0, sum1);
604
- sum2 = vaddq_f32(sum2, sum3);
605
- sum4 = vaddq_f32(sum4, sum5);
606
- sum6 = vaddq_f32(sum6, sum7);
607
- sum0 = vaddq_f32(sum0, sum2);
608
- sum4 = vaddq_f32(sum4, sum6);
609
- sum0 = vaddq_f32(sum0, sum4);
610
-
611
- sumf = vaddvq_f32(sum0);
612
- #endif
613
 
614
  // leftovers
615
- for (int i = n32; i < n; ++i) {
616
- sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
617
- }
618
- #elif defined(__AVX2__)
619
- // AVX 256-bit
620
- const int n32 = (n & ~31);
621
-
622
- __m256 sum0 = _mm256_setzero_ps();
623
- __m256 sum1 = _mm256_setzero_ps();
624
- __m256 sum2 = _mm256_setzero_ps();
625
- __m256 sum3 = _mm256_setzero_ps();
626
-
627
- __m256 x0, x1, x2, x3;
628
- __m256 y0, y1, y2, y3;
629
-
630
- for (int i = 0; i < n32; i += 32) {
631
- x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 )));
632
- x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 )));
633
- x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
634
- x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
635
-
636
- y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 )));
637
- y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 )));
638
- y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
639
- y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
640
-
641
- sum0 = _mm256_fmadd_ps(x0, y0, sum0);
642
- sum1 = _mm256_fmadd_ps(x1, y1, sum1);
643
- sum2 = _mm256_fmadd_ps(x2, y2, sum2);
644
- sum3 = _mm256_fmadd_ps(x3, y3, sum3);
645
- }
646
-
647
- const __m256 sum01 = _mm256_add_ps(sum0, sum1);
648
- const __m256 sum23 = _mm256_add_ps(sum2, sum3);
649
- const __m256 sum0123 = _mm256_add_ps(sum01, sum23);
650
-
651
- const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0123), _mm256_extractf128_ps(sum0123, 1));
652
- const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
653
- const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
654
-
655
- sumf = _mm_cvtss_f32(r1);
656
-
657
- // leftovers
658
- for (int i = n32; i < n; ++i) {
659
- //GGML_ASSERT(false);
660
- sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
661
- }
662
- #elif defined(__AVX__)
663
- // AVX 256-bit
664
- const int n32 = (n & ~31);
665
-
666
- __m256 sum0 = _mm256_setzero_ps();
667
- __m256 sum1 = _mm256_setzero_ps();
668
- __m256 sum2 = _mm256_setzero_ps();
669
- __m256 sum3 = _mm256_setzero_ps();
670
-
671
- __m256 x0, x1, x2, x3;
672
- __m256 y0, y1, y2, y3;
673
-
674
- for (int i = 0; i < n32; i += 32) {
675
- x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 )));
676
- x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 )));
677
- x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
678
- x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
679
-
680
- y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 )));
681
- y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 )));
682
- y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
683
- y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
684
-
685
- sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0);
686
- sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1);
687
- sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2);
688
- sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3);
689
- }
690
-
691
- const __m256 sum01 = _mm256_add_ps(sum0, sum1);
692
- const __m256 sum23 = _mm256_add_ps(sum2, sum3);
693
- const __m256 sum0123 = _mm256_add_ps(sum01, sum23);
694
-
695
- const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0123), _mm256_extractf128_ps(sum0123, 1));
696
- const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
697
- const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
698
-
699
- sumf = _mm_cvtss_f32(r1);
700
-
701
- // leftovers
702
- for (int i = n32; i < n; ++i) {
703
- //GGML_ASSERT(false);
704
  sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
705
  }
706
  #elif defined(__POWER9_VECTOR__)
 
 
707
  const int n32 = (n & ~31);
708
 
709
  vector float sum0 = vec_splats (0.0f);
@@ -754,54 +828,6 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
754
  for (int i = n32; i < n; ++i) {
755
  sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
756
  }
757
- #elif defined(__wasm_simd128__)
758
- // WASM 128-bit
759
- const int n16 = (n & ~15);
760
-
761
- v128_t sum0 = wasm_f32x4_splat(0.0f);
762
- v128_t sum1 = wasm_f32x4_splat(0.0f);
763
- v128_t sum2 = wasm_f32x4_splat(0.0f);
764
- v128_t sum3 = wasm_f32x4_splat(0.0f);
765
-
766
- v128_t x0, x1, x2, x3;
767
- v128_t y0, y1, y2, y3;
768
-
769
- float tx[16];
770
- float ty[16];
771
-
772
- for (int i = 0; i < n16; i += 16) {
773
- for (int k = 0; k < 16; ++k) {
774
- tx[k] = GGML_FP16_TO_FP32(x[i + k]);
775
- ty[k] = GGML_FP16_TO_FP32(y[i + k]);
776
- }
777
-
778
- x0 = wasm_v128_load(tx + 0);
779
- x1 = wasm_v128_load(tx + 4);
780
- x2 = wasm_v128_load(tx + 8);
781
- x3 = wasm_v128_load(tx + 12);
782
-
783
- y0 = wasm_v128_load(ty + 0);
784
- y1 = wasm_v128_load(ty + 4);
785
- y2 = wasm_v128_load(ty + 8);
786
- y3 = wasm_v128_load(ty + 12);
787
-
788
- sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0));
789
- sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1));
790
- sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2));
791
- sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3));
792
- }
793
-
794
- sum0 = wasm_f32x4_add(sum0, sum1);
795
- sum2 = wasm_f32x4_add(sum2, sum3);
796
- sum0 = wasm_f32x4_add(sum0, sum2);
797
-
798
- sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3);
799
-
800
- // leftovers
801
- for (int i = n16; i < n; ++i) {
802
- //GGML_ASSERT(false);
803
- sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
804
- }
805
  #else
806
  for (int i = 0; i < n; ++i) {
807
  sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
@@ -812,144 +838,26 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
812
  }
813
 
814
  inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
815
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
816
- // NEON 128-bit
817
- const int n16 = (n & ~15);
818
-
819
- const float32x4_t v4 = vdupq_n_f32(v);
820
-
821
- float32x4_t x0, x1, x2, x3;
822
- float32x4_t y0, y1, y2, y3;
823
-
824
- for (int i = 0; i < n16; i += 16) {
825
- x0 = vld1q_f32(x + i + 0);
826
- x1 = vld1q_f32(x + i + 4);
827
- x2 = vld1q_f32(x + i + 8);
828
- x3 = vld1q_f32(x + i + 12);
829
-
830
- y0 = vld1q_f32(y + i + 0);
831
- y1 = vld1q_f32(y + i + 4);
832
- y2 = vld1q_f32(y + i + 8);
833
- y3 = vld1q_f32(y + i + 12);
834
-
835
- y0 = vfmaq_f32(y0, x0, v4);
836
- y1 = vfmaq_f32(y1, x1, v4);
837
- y2 = vfmaq_f32(y2, x2, v4);
838
- y3 = vfmaq_f32(y3, x3, v4);
839
-
840
- vst1q_f32(y + i + 0, y0);
841
- vst1q_f32(y + i + 4, y1);
842
- vst1q_f32(y + i + 8, y2);
843
- vst1q_f32(y + i + 12, y3);
844
- }
845
-
846
- // leftovers
847
- for (int i = n16; i < n; ++i) {
848
- y[i] += x[i]*v;
849
- }
850
- #elif defined(__AVX2__)
851
- // AVX 256-bit
852
- const int n32 = (n & ~31);
853
-
854
- const __m256 v4 = _mm256_set1_ps(v);
855
 
856
- __m256 x0, x1, x2, x3;
857
- __m256 y0, y1, y2, y3;
858
-
859
- for (int i = 0; i < n32; i += 32) {
860
- x0 = _mm256_loadu_ps(x + i + 0);
861
- x1 = _mm256_loadu_ps(x + i + 8);
862
- x2 = _mm256_loadu_ps(x + i + 16);
863
- x3 = _mm256_loadu_ps(x + i + 24);
864
 
865
- y0 = _mm256_loadu_ps(y + i + 0);
866
- y1 = _mm256_loadu_ps(y + i + 8);
867
- y2 = _mm256_loadu_ps(y + i + 16);
868
- y3 = _mm256_loadu_ps(y + i + 24);
869
 
870
- y0 = _mm256_fmadd_ps(x0, v4, y0);
871
- y1 = _mm256_fmadd_ps(x1, v4, y1);
872
- y2 = _mm256_fmadd_ps(x2, v4, y2);
873
- y3 = _mm256_fmadd_ps(x3, v4, y3);
 
874
 
875
- _mm256_storeu_ps(y + i + 0, y0);
876
- _mm256_storeu_ps(y + i + 8, y1);
877
- _mm256_storeu_ps(y + i + 16, y2);
878
- _mm256_storeu_ps(y + i + 24, y3);
879
- }
880
-
881
- // leftovers
882
- for (int i = n32; i < n; ++i) {
883
- y[i] += x[i]*v;
884
- }
885
- #elif defined(__AVX__)
886
- // AVX 256-bit
887
- const int n32 = (n & ~31);
888
-
889
- const __m256 v4 = _mm256_set1_ps(v);
890
-
891
- __m256 x0, x1, x2, x3;
892
- __m256 y0, y1, y2, y3;
893
-
894
- for (int i = 0; i < n32; i += 32) {
895
- x0 = _mm256_loadu_ps(x + i + 0);
896
- x1 = _mm256_loadu_ps(x + i + 8);
897
- x2 = _mm256_loadu_ps(x + i + 16);
898
- x3 = _mm256_loadu_ps(x + i + 24);
899
-
900
- y0 = _mm256_loadu_ps(y + i + 0);
901
- y1 = _mm256_loadu_ps(y + i + 8);
902
- y2 = _mm256_loadu_ps(y + i + 16);
903
- y3 = _mm256_loadu_ps(y + i + 24);
904
-
905
- y0 = _mm256_add_ps(_mm256_mul_ps(x0, v4), y0);
906
- y1 = _mm256_add_ps(_mm256_mul_ps(x1, v4), y1);
907
- y2 = _mm256_add_ps(_mm256_mul_ps(x2, v4), y2);
908
- y3 = _mm256_add_ps(_mm256_mul_ps(x3, v4), y3);
909
-
910
- _mm256_storeu_ps(y + i + 0, y0);
911
- _mm256_storeu_ps(y + i + 8, y1);
912
- _mm256_storeu_ps(y + i + 16, y2);
913
- _mm256_storeu_ps(y + i + 24, y3);
914
- }
915
-
916
- // leftovers
917
- for (int i = n32; i < n; ++i) {
918
- y[i] += x[i]*v;
919
- }
920
- #elif defined(__wasm_simd128__)
921
- // WASM SIMD 128-bit
922
- const int n16 = (n & ~15);
923
-
924
- const v128_t v4 = wasm_f32x4_splat(v);
925
-
926
- v128_t x0, x1, x2, x3;
927
- v128_t y0, y1, y2, y3;
928
-
929
- for (int i = 0; i < n16; i += 16) {
930
- x0 = wasm_v128_load(x + i + 0);
931
- x1 = wasm_v128_load(x + i + 4);
932
- x2 = wasm_v128_load(x + i + 8);
933
- x3 = wasm_v128_load(x + i + 12);
934
-
935
- y0 = wasm_v128_load(y + i + 0);
936
- y1 = wasm_v128_load(y + i + 4);
937
- y2 = wasm_v128_load(y + i + 8);
938
- y3 = wasm_v128_load(y + i + 12);
939
-
940
- y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
941
- y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
942
- y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
943
- y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));
944
-
945
- wasm_v128_store(y + i + 0, y0);
946
- wasm_v128_store(y + i + 4, y1);
947
- wasm_v128_store(y + i + 8, y2);
948
- wasm_v128_store(y + i + 12, y3);
949
  }
950
 
951
  // leftovers
952
- for (int i = n16; i < n; ++i) {
953
  y[i] += x[i]*v;
954
  }
955
  #else
@@ -961,263 +869,86 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
961
  }
962
 
963
  inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) {
964
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
965
- // NEON 128-bit
966
- const int n32 = (n & ~31);
967
-
968
- #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
969
- const float16x8_t v8 = vdupq_n_f16(v);
970
 
971
- float16x8_t x0, x1, x2, x3;
972
- float16x8_t y0, y1, y2, y3;
973
 
974
- for (int i = 0; i < n32; i += 32) {
975
- y0 = vld1q_f16(y + i + 0 );
976
- y1 = vld1q_f16(y + i + 8 );
977
- y2 = vld1q_f16(y + i + 16);
978
- y3 = vld1q_f16(y + i + 24);
979
-
980
- x0 = vld1q_f16(x + i + 0 );
981
- x1 = vld1q_f16(x + i + 8 );
982
- x2 = vld1q_f16(x + i + 16);
983
- x3 = vld1q_f16(x + i + 24);
984
-
985
- y0 = vfmaq_f16(y0, x0, v8);
986
- y1 = vfmaq_f16(y1, x1, v8);
987
- y2 = vfmaq_f16(y2, x2, v8);
988
- y3 = vfmaq_f16(y3, x3, v8);
989
-
990
- vst1q_f16(y + i + 0 , y0);
991
- vst1q_f16(y + i + 8 , y1);
992
- vst1q_f16(y + i + 16, y2);
993
- vst1q_f16(y + i + 24, y3);
994
- }
995
- #else
996
- const float32x4_t v40 = vdupq_n_f32(v);
997
- const float32x4_t v41 = vdupq_n_f32(v);
998
 
999
- float32x4_t x0, x1, x2, x3, x4, x5, x6, x7;
1000
- float32x4_t y0, y1, y2, y3, y4, y5, y6, y7;
 
 
 
1001
 
1002
- for (int i = 0; i < n32; i += 32) {
1003
- y0 = vcvt_f32_f16(vld1_f16(y + i + 0 ));
1004
- y1 = vcvt_f32_f16(vld1_f16(y + i + 4 ));
1005
- y2 = vcvt_f32_f16(vld1_f16(y + i + 8 ));
1006
- y3 = vcvt_f32_f16(vld1_f16(y + i + 12));
1007
- y4 = vcvt_f32_f16(vld1_f16(y + i + 16));
1008
- y5 = vcvt_f32_f16(vld1_f16(y + i + 20));
1009
- y6 = vcvt_f32_f16(vld1_f16(y + i + 24));
1010
- y7 = vcvt_f32_f16(vld1_f16(y + i + 28));
1011
-
1012
- x0 = vcvt_f32_f16(vld1_f16(x + i + 0 ));
1013
- x1 = vcvt_f32_f16(vld1_f16(x + i + 4 ));
1014
- x2 = vcvt_f32_f16(vld1_f16(x + i + 8 ));
1015
- x3 = vcvt_f32_f16(vld1_f16(x + i + 12));
1016
- x4 = vcvt_f32_f16(vld1_f16(x + i + 16));
1017
- x5 = vcvt_f32_f16(vld1_f16(x + i + 20));
1018
- x6 = vcvt_f32_f16(vld1_f16(x + i + 24));
1019
- x7 = vcvt_f32_f16(vld1_f16(x + i + 28));
1020
-
1021
- y0 = vfmaq_f32(y0, x0, v40);
1022
- y1 = vfmaq_f32(y1, x1, v40);
1023
- y2 = vfmaq_f32(y2, x2, v40);
1024
- y3 = vfmaq_f32(y3, x3, v40);
1025
- y4 = vfmaq_f32(y4, x4, v41);
1026
- y5 = vfmaq_f32(y5, x5, v41);
1027
- y6 = vfmaq_f32(y6, x6, v41);
1028
- y7 = vfmaq_f32(y7, x7, v41);
1029
-
1030
- vst1_f16(y + i + 0 , vcvt_f16_f32(y0));
1031
- vst1_f16(y + i + 4 , vcvt_f16_f32(y1));
1032
- vst1_f16(y + i + 8 , vcvt_f16_f32(y2));
1033
- vst1_f16(y + i + 12, vcvt_f16_f32(y3));
1034
- vst1_f16(y + i + 16, vcvt_f16_f32(y4));
1035
- vst1_f16(y + i + 20, vcvt_f16_f32(y5));
1036
- vst1_f16(y + i + 24, vcvt_f16_f32(y6));
1037
- vst1_f16(y + i + 28, vcvt_f16_f32(y7));
1038
  }
1039
- #endif
1040
 
1041
  // leftovers
1042
- for (int i = n32; i < n; ++i) {
1043
  GGML_ASSERT(false);
1044
  y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
1045
  }
1046
- #elif defined(__AVX2__)
1047
- // AVX 256-bit
 
1048
  const int n32 = (n & ~31);
1049
-
1050
- const __m256 v8 = _mm256_set1_ps(v);
1051
-
1052
- __m256 x0, x1, x2, x3;
1053
- __m256 y0, y1, y2, y3;
1054
-
1055
  for (int i = 0; i < n32; i += 32) {
1056
- y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 )));
1057
- y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 )));
1058
- y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
1059
- y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
1060
-
1061
- x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 )));
1062
- x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 )));
1063
- x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
1064
- x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
1065
-
1066
- y0 = _mm256_fmadd_ps(x0, v8, y0);
1067
- y1 = _mm256_fmadd_ps(x1, v8, y1);
1068
- y2 = _mm256_fmadd_ps(x2, v8, y2);
1069
- y3 = _mm256_fmadd_ps(x3, v8, y3);
1070
-
1071
- _mm_storeu_si128((__m128i*)(y + i + 0 ), _mm256_cvtps_ph(y0, 0));
1072
- _mm_storeu_si128((__m128i*)(y + i + 8 ), _mm256_cvtps_ph(y1, 0));
1073
- _mm_storeu_si128((__m128i*)(y + i + 16), _mm256_cvtps_ph(y2, 0));
1074
- _mm_storeu_si128((__m128i*)(y + i + 24), _mm256_cvtps_ph(y3, 0));
1075
- }
1076
 
1077
- // leftovers
1078
- for (int i = n32; i < n; ++i) {
1079
- GGML_ASSERT(false);
1080
- y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
1081
- }
1082
- #elif defined(__AVX__)
1083
- // AVX 256-bit
1084
- const int n32 = (n & ~31);
1085
 
1086
- const __m256 v8 = _mm256_set1_ps(v);
1087
 
1088
- __m256 x0, x1, x2, x3;
1089
- __m256 y0, y1, y2, y3;
1090
 
1091
- for (int i = 0; i < n32; i += 32) {
1092
- y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 )));
1093
- y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 )));
1094
- y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
1095
- y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
 
 
 
1096
 
1097
- x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 )));
1098
- x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 )));
1099
- x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
1100
- x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
 
 
 
 
1101
 
1102
- y0 = _mm256_add_ps(_mm256_mul_ps(x0, v8), y0);
1103
- y1 = _mm256_add_ps(_mm256_mul_ps(x1, v8), y1);
1104
- y2 = _mm256_add_ps(_mm256_mul_ps(x2, v8), y2);
1105
- y3 = _mm256_add_ps(_mm256_mul_ps(x3, v8), y3);
1106
 
1107
- _mm_storeu_si128((__m128i*)(y + i + 0 ), _mm256_cvtps_ph(y0, 0));
1108
- _mm_storeu_si128((__m128i*)(y + i + 8 ), _mm256_cvtps_ph(y1, 0));
1109
- _mm_storeu_si128((__m128i*)(y + i + 16), _mm256_cvtps_ph(y2, 0));
1110
- _mm_storeu_si128((__m128i*)(y + i + 24), _mm256_cvtps_ph(y3, 0));
1111
  }
1112
 
1113
- // leftovers
1114
  for (int i = n32; i < n; ++i) {
1115
- GGML_ASSERT(false);
1116
- y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
1117
- }
1118
- #elif defined(__POWER9_VECTOR__)
1119
- const int n32 = (n & ~31);
1120
- for (int i = 0; i < n32; i += 32) {
1121
- // Use vec_xl, not vec_ld, because x is sometimes unaligned!
1122
- vector unsigned short x0 = vec_xl(i * 2 + 0, x);
1123
- vector unsigned short x1 = vec_xl(i * 2 + 16, x);
1124
- vector unsigned short x2 = vec_xl(i * 2 + 32, x);
1125
- vector unsigned short x3 = vec_xl(i * 2 + 48, x);
1126
-
1127
- vector unsigned short y0 = vec_xl(i * 2 + 0, y);
1128
- vector unsigned short y1 = vec_xl(i * 2 + 16, y);
1129
- vector unsigned short y2 = vec_xl(i * 2 + 32, y);
1130
- vector unsigned short y3 = vec_xl(i * 2 + 48, y);
1131
-
1132
- vector float v4 = vec_splats(v);
1133
-
1134
- vector float fx0l = vec_extract_fp32_from_shortl(x0);
1135
- vector float fx0h = vec_extract_fp32_from_shorth(x0);
1136
- vector float fx1l = vec_extract_fp32_from_shortl(x1);
1137
- vector float fx1h = vec_extract_fp32_from_shorth(x1);
1138
- vector float fx2l = vec_extract_fp32_from_shortl(x2);
1139
- vector float fx2h = vec_extract_fp32_from_shorth(x2);
1140
- vector float fx3l = vec_extract_fp32_from_shortl(x3);
1141
- vector float fx3h = vec_extract_fp32_from_shorth(x3);
1142
-
1143
- vector float fy0l = vec_extract_fp32_from_shortl(y0);
1144
- vector float fy0h = vec_extract_fp32_from_shorth(y0);
1145
- vector float fy1l = vec_extract_fp32_from_shortl(y1);
1146
- vector float fy1h = vec_extract_fp32_from_shorth(y1);
1147
- vector float fy2l = vec_extract_fp32_from_shortl(y2);
1148
- vector float fy2h = vec_extract_fp32_from_shorth(y2);
1149
- vector float fy3l = vec_extract_fp32_from_shortl(y3);
1150
- vector float fy3h = vec_extract_fp32_from_shorth(y3);
1151
-
1152
- fy0l = vec_madd(fx0l, v4, fy0l);
1153
- fy0h = vec_madd(fx0h, v4, fy0h);
1154
- fy1l = vec_madd(fx1l, v4, fy1l);
1155
- fy1h = vec_madd(fx1h, v4, fy1h);
1156
- fy2l = vec_madd(fx2l, v4, fy2l);
1157
- fy2h = vec_madd(fx2h, v4, fy2h);
1158
- fy3l = vec_madd(fx3l, v4, fy3l);
1159
- fy3h = vec_madd(fx3h, v4, fy3h);
1160
-
1161
- y0 = vec_pack_to_short_fp32(fy0h, fy0l);
1162
- y1 = vec_pack_to_short_fp32(fy1h, fy1l);
1163
- y2 = vec_pack_to_short_fp32(fy2h, fy2l);
1164
- y3 = vec_pack_to_short_fp32(fy3h, fy3l);
1165
-
1166
- vec_xst(y0, i * 2 + 0, y);
1167
- vec_xst(y1, i * 2 + 16, y);
1168
- vec_xst(y2, i * 2 + 32, y);
1169
- vec_xst(y3, i * 2 + 48, y);
1170
- }
1171
-
1172
- for (int i = n32; i < n; ++i) {
1173
- y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
1174
- }
1175
- #elif defined(__wasm_simd128__)
1176
- // WASM SIMD 128-bit
1177
- const int n16 = (n & ~15);
1178
-
1179
- const v128_t v4 = wasm_f32x4_splat(v);
1180
-
1181
- v128_t x0, x1, x2, x3;
1182
- v128_t y0, y1, y2, y3;
1183
-
1184
- float tx[16];
1185
- float ty[16];
1186
-
1187
- for (int i = 0; i < n16; i += 16) {
1188
- for (int k = 0; k < 16; ++k) {
1189
- tx[k] = GGML_FP16_TO_FP32(x[i + k]);
1190
- ty[k] = GGML_FP16_TO_FP32(y[i + k]);
1191
- }
1192
-
1193
- x0 = wasm_v128_load(tx + 0);
1194
- x1 = wasm_v128_load(tx + 4);
1195
- x2 = wasm_v128_load(tx + 8);
1196
- x3 = wasm_v128_load(tx + 12);
1197
-
1198
- y0 = wasm_v128_load(ty + 0);
1199
- y1 = wasm_v128_load(ty + 4);
1200
- y2 = wasm_v128_load(ty + 8);
1201
- y3 = wasm_v128_load(ty + 12);
1202
-
1203
- y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
1204
- y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
1205
- y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
1206
- y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));
1207
-
1208
- wasm_v128_store(ty + 0, y0);
1209
- wasm_v128_store(ty + 4, y1);
1210
- wasm_v128_store(ty + 8, y2);
1211
- wasm_v128_store(ty + 12, y3);
1212
-
1213
- for (int k = 0; k < 16; ++k) {
1214
- y[i + k] = GGML_FP32_TO_FP16(ty[k]);
1215
- }
1216
- }
1217
-
1218
- // leftovers
1219
- for (int i = n16; i < n; ++i) {
1220
- GGML_ASSERT(false);
1221
  y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
1222
  }
1223
  #else
@@ -1229,33 +960,24 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
1229
 
1230
  //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
1231
  inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
1232
- #if defined(__AVX__) || defined(__AVX2__)
1233
- // AVX 256-bit
1234
- const int n32 = (n & ~31);
1235
 
1236
- const __m256 v4 = _mm256_set1_ps(v);
1237
 
1238
- __m256 y0, y1, y2, y3;
1239
-
1240
- for (int i = 0; i < n32; i += 32) {
1241
- y0 = _mm256_loadu_ps(y + i + 0);
1242
- y1 = _mm256_loadu_ps(y + i + 8);
1243
- y2 = _mm256_loadu_ps(y + i + 16);
1244
- y3 = _mm256_loadu_ps(y + i + 24);
1245
 
1246
- y0 = _mm256_mul_ps(y0, v4);
1247
- y1 = _mm256_mul_ps(y1, v4);
1248
- y2 = _mm256_mul_ps(y2, v4);
1249
- y3 = _mm256_mul_ps(y3, v4);
1250
 
1251
- _mm256_storeu_ps(y + i + 0, y0);
1252
- _mm256_storeu_ps(y + i + 8, y1);
1253
- _mm256_storeu_ps(y + i + 16, y2);
1254
- _mm256_storeu_ps(y + i + 24, y3);
1255
  }
1256
 
1257
  // leftovers
1258
- for (int i = n32; i < n; ++i) {
1259
  y[i] *= v;
1260
  }
1261
  #else
@@ -8533,6 +8255,14 @@ int ggml_cpu_has_avx512(void) {
8533
  #endif
8534
  }
8535
 
 
 
 
 
 
 
 
 
8536
  int ggml_cpu_has_neon(void) {
8537
  #if defined(__ARM_NEON)
8538
  return 1;
 
316
  static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
317
 
318
  //
319
+ // simd mappings
320
  //
321
 
322
+ // we define a common set of C macros which map to specific intrinsics based on the current architecture
323
+ // we then implement the fundamental computation operations below using only these macros
324
+ // adding support for new architectures requires to define the corresponding SIMD macros
325
+ //
326
+ // GGML_F32_STEP / GGML_F16_STEP
327
+ // number of elements to process in a single step
328
+ //
329
+ // GGML_F32_EPR / GGML_F16_EPR
330
+ // number of elements to fit in a single register
331
+ //
 
 
 
 
 
 
 
332
 
 
 
333
  #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
 
 
 
 
 
 
 
334
 
335
+ #define GGML_SIMD
 
336
 
337
+ // F32 NEON
 
 
 
 
338
 
339
+ #define GGML_F32_STEP 16
340
+ #define GGML_F32_EPR 4
 
 
 
 
 
 
 
 
341
 
342
+ #define GGML_F32x4 float32x4_t
343
+ #define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
344
+ #define GGML_F32x4_SET1(x) vdupq_n_f32(x)
345
+ #define GGML_F32x4_LOAD vld1q_f32
346
+ #define GGML_F32x4_STORE vst1q_f32
347
+ #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
348
+ #define GGML_F32x4_ADD vaddq_f32
349
+ #define GGML_F32x4_MUL vmulq_f32
350
+ #if defined(__ARM_FEATURE_QRDMX)
351
+ #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
352
+ #else
353
+ #define GGML_F32x4_REDUCE_ONE(x) \
354
+ (vgetq_lane_f32(x, 0) + \
355
+ vgetq_lane_f32(x, 1) + \
356
+ vgetq_lane_f32(x, 2) + \
357
+ vgetq_lane_f32(x, 3))
358
+ #endif
359
+ #define GGML_F32x4_REDUCE(res, x) \
360
+ { \
361
+ for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
362
+ x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \
363
+ } \
364
+ for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
365
+ x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \
366
+ } \
367
+ for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
368
+ x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \
369
+ } \
370
+ res = GGML_F32x4_REDUCE_ONE(x[0]); \
371
+ }
372
+
373
+ #define GGML_F32_VEC GGML_F32x4
374
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
375
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
376
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
377
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
378
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
379
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
380
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
381
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
382
+
383
+ // F16 NEON
384
 
385
+ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
386
+ #define GGML_F16_STEP 32
387
+ #define GGML_F16_EPR 8
388
+
389
+ #define GGML_F16x8 float16x8_t
390
+ #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
391
+ #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
392
+ #define GGML_F16x8_LOAD vld1q_f16
393
+ #define GGML_F16x8_STORE vst1q_f16
394
+ #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
395
+ #define GGML_F16x8_ADD vaddq_f16
396
+ #define GGML_F16x8_MUL vmulq_f16
397
+ #define GGML_F16x8_REDUCE(res, x) \
398
+ { \
399
+ for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
400
+ x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \
401
+ } \
402
+ for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
403
+ x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \
404
+ } \
405
+ for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
406
+ x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \
407
+ } \
408
+ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
409
+ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
410
+ res = vaddvq_f32(vaddq_f32(t0, t1)); \
411
+ }
412
+
413
+ #define GGML_F16_VEC GGML_F16x8
414
+ #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
415
+ #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
416
+ #define GGML_F16_VEC_LOAD GGML_F16x8_LOAD
417
+ #define GGML_F16_VEC_STORE GGML_F16x8_STORE
418
+ #define GGML_F16_VEC_FMA GGML_F16x8_FMA
419
+ #define GGML_F16_VEC_ADD GGML_F16x8_ADD
420
+ #define GGML_F16_VEC_MUL GGML_F16x8_MUL
421
+ #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
422
+ #else
423
+ // if FP16 vector arithmetic is not supported, we use FP32 instead
424
+ // and take advantage of the vcvt_ functions to convert to/from FP16
425
+
426
+ #define GGML_F16_STEP 16
427
+ #define GGML_F16_EPR 4
428
+
429
+ #define GGML_F32Cx4 float32x4_t
430
+ #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
431
+ #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
432
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x))
433
+ #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
434
+ #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
435
+ #define GGML_F32Cx4_ADD vaddq_f32
436
+ #define GGML_F32Cx4_MUL vmulq_f32
437
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
438
+
439
+ #define GGML_F16_VEC GGML_F32Cx4
440
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
441
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
442
+ #define GGML_F16_VEC_LOAD GGML_F32Cx4_LOAD
443
+ #define GGML_F16_VEC_STORE GGML_F32Cx4_STORE
444
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
445
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
446
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
447
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
448
+ #endif
449
 
450
+ #elif defined(__AVX__)
 
 
 
451
 
452
+ #define GGML_SIMD
 
453
 
454
+ // F32 AVX
 
 
 
 
455
 
456
+ #define GGML_F32_STEP 32
457
+ #define GGML_F32_EPR 8
 
 
458
 
459
+ #define GGML_F32x8 __m256
460
+ #define GGML_F32x8_ZERO _mm256_setzero_ps()
461
+ #define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
462
+ #define GGML_F32x8_LOAD _mm256_loadu_ps
463
+ #define GGML_F32x8_STORE _mm256_storeu_ps
464
+ #if defined(__FMA__)
465
+ #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
466
+ #else
467
+ #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
468
+ #endif
469
+ #define GGML_F32x8_ADD _mm256_add_ps
470
+ #define GGML_F32x8_MUL _mm256_mul_ps
471
+ #define GGML_F32x8_REDUCE(res, x) \
472
+ { \
473
+ for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
474
+ x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \
475
+ } \
476
+ for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
477
+ x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \
478
+ } \
479
+ for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
480
+ x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \
481
+ } \
482
+ const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
483
+ _mm256_extractf128_ps(x[0], 1)); \
484
+ const __m128 t1 = _mm_hadd_ps(t0, t0); \
485
+ res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
486
+ }
487
+ // TODO: is this optimal ?
488
+
489
+ #define GGML_F32_VEC GGML_F32x8
490
+ #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
491
+ #define GGML_F32_VEC_SET1 GGML_F32x8_SET1
492
+ #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
493
+ #define GGML_F32_VEC_STORE GGML_F32x8_STORE
494
+ #define GGML_F32_VEC_FMA GGML_F32x8_FMA
495
+ #define GGML_F32_VEC_ADD GGML_F32x8_ADD
496
+ #define GGML_F32_VEC_MUL GGML_F32x8_MUL
497
+ #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
498
+
499
+ // F16 AVX
500
+
501
+ #define GGML_F16_STEP 32
502
+ #define GGML_F16_EPR 8
503
+
504
+ // F16 arithmetic is not supported by AVX, so we use F32 instead
505
+ // we take advantage of the _mm256_cvt intrinsics to convert F16 <-> F32
506
+
507
+ #define GGML_F32Cx8 __m256
508
+ #define GGML_F32Cx8_ZERO _mm256_setzero_ps()
509
+ #define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
510
+ #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
511
+ #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
512
+ #define GGML_F32Cx8_FMA GGML_F32x8_FMA
513
+ #define GGML_F32Cx8_ADD _mm256_add_ps
514
+ #define GGML_F32Cx8_MUL _mm256_mul_ps
515
+ #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
516
+
517
+ #define GGML_F16_VEC GGML_F32Cx8
518
+ #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
519
+ #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
520
+ #define GGML_F16_VEC_LOAD GGML_F32Cx8_LOAD
521
+ #define GGML_F16_VEC_STORE GGML_F32Cx8_STORE
522
+ #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
523
+ #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
524
+ #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
525
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
526
 
527
+ #elif defined(__POWER9_VECTOR__)
 
 
528
 
529
+ // TODO: uncomment this when it works
530
+ //#define GGML_SIMD
531
+
532
+ // F32 POWER9
533
+
534
+ #define GGML_F32_STEP 32
535
+ #define GGML_F32_EPR 8
536
+
537
+ // TODO: not tested !!
538
+ #define GGML_F32x4 __vector float
539
+ #define GGML_F32x4_ZERO (__vector float){0.0f, 0.0f, 0.0f, 0.0f}
540
+ #define GGML_F32x4_SET1(x) (__vector float){x, x, x, x}
541
+ #define GGML_F32x4_LOAD vec_vsx_ld
542
+ #define GGML_F32x4_STORE vec_vsx_st
543
+ #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
544
+ #define GGML_F32x4_ADD vec_add
545
+ #define GGML_F32x4_MUL vec_mul
546
+ #define GGML_F32x4_REDUCE(res, x) \
547
+ { \
548
+ for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
549
+ x[2*i] = vec_add(x[2*i], x[2*i+1]); \
550
+ } \
551
+ for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
552
+ x[4*i] = vec_add(x[4*i], x[4*i+2]); \
553
+ } \
554
+ for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
555
+ x[8*i] = vec_add(x[8*i], x[8*i+4]); \
556
+ } \
557
+ res = vec_extract(x[0], 0) + \
558
+ vec_extract(x[0], 1) + \
559
+ vec_extract(x[0], 2) + \
560
+ vec_extract(x[0], 3); \
561
+ }
562
+
563
+ #define GGML_F32_VEC GGML_F32x4
564
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
565
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
566
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
567
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
568
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
569
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
570
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
571
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
572
+
573
+ // F16 POWER9
574
+ // TODO: implement here
575
+ // ...
576
 
577
+ #elif defined(__wasm_simd128__)
578
 
579
+ #define GGML_SIMD
580
+
581
+ // F32 WASM
582
+
583
+ #define GGML_F32_STEP 16
584
+ #define GGML_F32_EPR 4
585
+
586
+ #define GGML_F32x4 v128_t
587
+ #define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
588
+ #define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
589
+ #define GGML_F32x4_LOAD wasm_v128_load
590
+ #define GGML_F32x4_STORE wasm_v128_store
591
+ #define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
592
+ #define GGML_F32x4_ADD wasm_f32x4_add
593
+ #define GGML_F32x4_MUL wasm_f32x4_mul
594
+ #define GGML_F32x4_REDUCE(res, x) \
595
+ { \
596
+ for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
597
+ x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
598
+ } \
599
+ for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
600
+ x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
601
+ } \
602
+ for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
603
+ x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
604
+ } \
605
+ res = wasm_f32x4_extract_lane(x[0], 0) + \
606
+ wasm_f32x4_extract_lane(x[0], 1) + \
607
+ wasm_f32x4_extract_lane(x[0], 2) + \
608
+ wasm_f32x4_extract_lane(x[0], 3); \
609
+ }
610
+
611
+ #define GGML_F32_VEC GGML_F32x4
612
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
613
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
614
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
615
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
616
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
617
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
618
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
619
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
620
+
621
+ // F16 WASM
622
+
623
+ #define GGML_F16_STEP 16
624
+ #define GGML_F16_EPR 4
625
+
626
+ inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
627
+ float tmp[4];
628
+
629
+ tmp[0] = GGML_FP16_TO_FP32(p[0]);
630
+ tmp[1] = GGML_FP16_TO_FP32(p[1]);
631
+ tmp[2] = GGML_FP16_TO_FP32(p[2]);
632
+ tmp[3] = GGML_FP16_TO_FP32(p[3]);
633
+
634
+ return wasm_v128_load(tmp);
635
+ }
636
+
637
+ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
638
+ float tmp[4];
639
+
640
+ wasm_v128_store(tmp, x);
641
+
642
+ p[0] = GGML_FP32_TO_FP16(tmp[0]);
643
+ p[1] = GGML_FP32_TO_FP16(tmp[1]);
644
+ p[2] = GGML_FP32_TO_FP16(tmp[2]);
645
+ p[3] = GGML_FP32_TO_FP16(tmp[3]);
646
+ }
647
+
648
+ #define GGML_F16x4 v128_t
649
+ #define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
650
+ #define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
651
+ #define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
652
+ #define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
653
+ #define GGML_F16x4_FMA GGML_F32x4_FMA
654
+ #define GGML_F16x4_ADD wasm_f32x4_add
655
+ #define GGML_F16x4_MUL wasm_f32x4_mul
656
+ #define GGML_F16x4_REDUCE(res, x) \
657
+ { \
658
+ for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
659
+ x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
660
+ } \
661
+ for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
662
+ x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
663
+ } \
664
+ for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
665
+ x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
666
+ } \
667
+ res = wasm_f32x4_extract_lane(x[0], 0) + \
668
+ wasm_f32x4_extract_lane(x[0], 1) + \
669
+ wasm_f32x4_extract_lane(x[0], 2) + \
670
+ wasm_f32x4_extract_lane(x[0], 3); \
671
+ }
672
+
673
+ #define GGML_F16_VEC GGML_F16x4
674
+ #define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
675
+ #define GGML_F16_VEC_SET1 GGML_F16x4_SET1
676
+ #define GGML_F16_VEC_LOAD GGML_F16x4_LOAD
677
+ #define GGML_F16_VEC_STORE GGML_F16x4_STORE
678
+ #define GGML_F16_VEC_FMA GGML_F16x4_FMA
679
+ #define GGML_F16_VEC_ADD GGML_F16x4_ADD
680
+ #define GGML_F16_VEC_MUL GGML_F16x4_MUL
681
+ #define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
682
 
683
+ #endif
 
 
 
684
 
685
+ // GGML_F32_ARR / GGML_F16_ARR
686
+ // number of registers to use per step
687
+ #ifdef GGML_SIMD
688
+ #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
689
+ #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
690
+ #endif
691
 
692
+ //
693
+ // fundamental operations
694
+ //
 
 
695
 
696
+ inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
 
 
697
 
698
+ inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
 
 
 
699
 
700
+ inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
 
701
 
702
+ inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
 
703
 
704
+ inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
705
+ inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
706
+ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
707
+ inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
708
+ inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
709
+ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
710
+ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
711
+ inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
712
+ inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
713
 
714
+ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
715
+ ggml_float sumf = 0.0;
 
 
 
 
 
716
 
717
+ #ifdef GGML_SIMD
718
+ const int np = (n & ~(GGML_F32_STEP - 1));
 
 
719
 
720
+ GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
 
721
 
722
+ GGML_F32_VEC ax[GGML_F32_ARR];
723
+ GGML_F32_VEC ay[GGML_F32_ARR];
 
 
 
724
 
725
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
726
+ for (int j = 0; j < GGML_F32_ARR; j++) {
727
+ ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
728
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
729
 
730
+ sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
731
+ }
 
 
732
  }
733
 
734
+ // reduce sum0..sum3 to sum0
735
+ GGML_F32_VEC_REDUCE(sumf, sum);
 
 
 
736
 
737
  // leftovers
738
+ for (int i = np; i < n; ++i) {
739
  sumf += x[i]*y[i];
740
  }
741
  #else
 
750
 
751
  inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
752
  ggml_float sumf = 0.0;
 
 
753
 
754
+ #if defined(GGML_SIMD)
755
+ const int np = (n & ~(GGML_F16_STEP - 1));
 
 
 
756
 
757
+ GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
 
758
 
759
+ GGML_F16_VEC ax[GGML_F16_ARR];
760
+ GGML_F16_VEC ay[GGML_F16_ARR];
 
 
 
761
 
762
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
763
+ for (int j = 0; j < GGML_F16_ARR; j++) {
764
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR);
765
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR);
766
 
767
+ sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
768
+ }
 
 
769
  }
770
 
771
  // reduce sum0..sum3 to sum0
772
+ GGML_F16_VEC_REDUCE(sumf, sum);
773
 
774
  // leftovers
775
+ for (int i = np; i < n; ++i) {
776
  sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
777
  }
778
  #elif defined(__POWER9_VECTOR__)
779
+ // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
780
+ // being able to test it. hoping someone with access to a POWER9 machine can help out here.
781
  const int n32 = (n & ~31);
782
 
783
  vector float sum0 = vec_splats (0.0f);
 
828
  for (int i = n32; i < n; ++i) {
829
  sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
830
  }
831
  #else
832
  for (int i = 0; i < n; ++i) {
833
  sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
 
838
  }
839
 
840
  inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
841
+ #if defined(GGML_SIMD)
842
+ const int np = (n & ~(GGML_F32_STEP - 1));
843
 
844
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
 
 
 
 
 
 
 
845
 
846
+ GGML_F32_VEC ax[GGML_F32_ARR];
847
+ GGML_F32_VEC ay[GGML_F32_ARR];
 
 
848
 
849
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
850
+ for (int j = 0; j < GGML_F32_ARR; j++) {
851
+ ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
852
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
853
+ ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
854
 
855
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
856
+ }
857
  }
858
 
859
  // leftovers
860
+ for (int i = np; i < n; ++i) {
861
  y[i] += x[i]*v;
862
  }
863
  #else
 
869
  }
870
 
871
  inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) {
872
+ #if defined(GGML_SIMD)
873
+ const int np = (n & ~(GGML_F16_STEP - 1));
 
 
 
 
874
 
875
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
 
876
 
877
+ GGML_F16_VEC ax[GGML_F16_ARR];
878
+ GGML_F16_VEC ay[GGML_F16_ARR];
879
 
880
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
881
+ for (int j = 0; j < GGML_F16_ARR; j++) {
882
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR);
883
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR);
884
+ ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
885
 
886
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay[j]);
887
+ }
888
  }
 
889
 
890
  // leftovers
891
+ for (int i = np; i < n; ++i) {
892
  GGML_ASSERT(false);
893
  y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
894
  }
895
+ #elif defined(__POWER9_VECTOR__)
896
+ // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
897
+ // being able to test it. hoping someone with access to a POWER9 machine can help out here.
898
  const int n32 = (n & ~31);
 
 
 
 
 
 
899
  for (int i = 0; i < n32; i += 32) {
900
+ // Use vec_xl, not vec_ld, because x is sometimes unaligned!
901
+ vector unsigned short x0 = vec_xl(i * 2 + 0, x);
902
+ vector unsigned short x1 = vec_xl(i * 2 + 16, x);
903
+ vector unsigned short x2 = vec_xl(i * 2 + 32, x);
904
+ vector unsigned short x3 = vec_xl(i * 2 + 48, x);
905
 
906
+ vector unsigned short y0 = vec_xl(i * 2 + 0, y);
907
+ vector unsigned short y1 = vec_xl(i * 2 + 16, y);
908
+ vector unsigned short y2 = vec_xl(i * 2 + 32, y);
909
+ vector unsigned short y3 = vec_xl(i * 2 + 48, y);
 
 
 
 
910
 
911
+ vector float v4 = vec_splats(v);
912
 
913
+ vector float fx0l = vec_extract_fp32_from_shortl(x0);
914
+ vector float fx0h = vec_extract_fp32_from_shorth(x0);
915
+ vector float fx1l = vec_extract_fp32_from_shortl(x1);
916
+ vector float fx1h = vec_extract_fp32_from_shorth(x1);
917
+ vector float fx2l = vec_extract_fp32_from_shortl(x2);
918
+ vector float fx2h = vec_extract_fp32_from_shorth(x2);
919
+ vector float fx3l = vec_extract_fp32_from_shortl(x3);
920
+ vector float fx3h = vec_extract_fp32_from_shorth(x3);
921
 
922
+ vector float fy0l = vec_extract_fp32_from_shortl(y0);
923
+ vector float fy0h = vec_extract_fp32_from_shorth(y0);
924
+ vector float fy1l = vec_extract_fp32_from_shortl(y1);
925
+ vector float fy1h = vec_extract_fp32_from_shorth(y1);
926
+ vector float fy2l = vec_extract_fp32_from_shortl(y2);
927
+ vector float fy2h = vec_extract_fp32_from_shorth(y2);
928
+ vector float fy3l = vec_extract_fp32_from_shortl(y3);
929
+ vector float fy3h = vec_extract_fp32_from_shorth(y3);
930
 
931
+ fy0l = vec_madd(fx0l, v4, fy0l);
932
+ fy0h = vec_madd(fx0h, v4, fy0h);
933
+ fy1l = vec_madd(fx1l, v4, fy1l);
934
+ fy1h = vec_madd(fx1h, v4, fy1h);
935
+ fy2l = vec_madd(fx2l, v4, fy2l);
936
+ fy2h = vec_madd(fx2h, v4, fy2h);
937
+ fy3l = vec_madd(fx3l, v4, fy3l);
938
+ fy3h = vec_madd(fx3h, v4, fy3h);
939
 
940
+ y0 = vec_pack_to_short_fp32(fy0h, fy0l);
941
+ y1 = vec_pack_to_short_fp32(fy1h, fy1l);
942
+ y2 = vec_pack_to_short_fp32(fy2h, fy2l);
943
+ y3 = vec_pack_to_short_fp32(fy3h, fy3l);
944
 
945
+ vec_xst(y0, i * 2 + 0, y);
946
+ vec_xst(y1, i * 2 + 16, y);
947
+ vec_xst(y2, i * 2 + 32, y);
948
+ vec_xst(y3, i * 2 + 48, y);
949
  }
950
 
 
951
  for (int i = n32; i < n; ++i) {
952
  y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
953
  }
954
  #else
 
960
 
961
  //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
962
  inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
963
+ #if defined(GGML_SIMD)
964
+ const int np = (n & ~(GGML_F32_STEP - 1));
 
965
 
966
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
967
 
968
+ GGML_F32_VEC ay[GGML_F32_ARR];
 
 
 
 
 
 
969
 
970
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
971
+ for (int j = 0; j < GGML_F32_ARR; j++) {
972
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
973
+ ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
974
 
975
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
976
+ }
 
 
977
  }
978
 
979
  // leftovers
980
+ for (int i = np; i < n; ++i) {
981
  y[i] *= v;
982
  }
983
  #else
 
8255
  #endif
8256
  }
8257
 
8258
+ int ggml_cpu_has_fma(void) {
8259
+ #if defined(__FMA__)
8260
+ return 1;
8261
+ #else
8262
+ return 0;
8263
+ #endif
8264
+ }
8265
+
8266
  int ggml_cpu_has_neon(void) {
8267
  #if defined(__ARM_NEON)
8268
  return 1;
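
Note on the "generic reduce for all register sizes" from the commit message: GGML_F32x4_REDUCE and its siblings above fold the GGML_F32_ARR accumulator registers pairwise (ARR/2 additions, then ARR/4, then ARR/8) and finally collapse the surviving register into a scalar, so the same macro works whether a step uses 2, 4, or 8 registers. A minimal scalar sketch of that folding, assuming GGML_F32_ARR == 4, purely as an illustration and not code from the commit:

    // illustration only: the pairwise folding done by the GGML_*_REDUCE macros,
    // written with plain floats instead of SIMD registers and ARR fixed to 4
    static float reduce_arr4(float acc[4]) {
        acc[0] += acc[1];   // ARR/2 = 2 pairwise adds
        acc[2] += acc[3];
        acc[0] += acc[2];   // ARR/4 = 1 pairwise add
                            // ARR/8 = 0, so the third folding loop does nothing
        return acc[0];      // corresponds to GGML_F32x4_REDUCE_ONE(x[0])
    }
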
ggml.h CHANGED
@@ -724,6 +724,7 @@ enum ggml_opt_result ggml_opt(
  int ggml_cpu_has_avx(void);
  int ggml_cpu_has_avx2(void);
  int ggml_cpu_has_avx512(void);
+ int ggml_cpu_has_fma(void);
  int ggml_cpu_has_neon(void);
  int ggml_cpu_has_arm_fma(void);
  int ggml_cpu_has_f16c(void);
whisper.cpp CHANGED
@@ -2555,6 +2555,7 @@ const char * whisper_print_system_info(void) {
  s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
  s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
  s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+ s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
  s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
  s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
  s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";