ggml : simplify the SIMD code (#324)
* ggml : simplify the SIMD code
* ggml : generic reduce for all register sizes + comments
- ggml.c +489 -759
- ggml.h +1 -0
- whisper.cpp +1 -0
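In short: the hand-unrolled NEON/AVX/WASM loops in ggml_vec_dot_f32, ggml_vec_dot_f16, ggml_vec_mad_f32 and ggml_vec_mad_f16 are replaced by one generic implementation written against GGML_F32_VEC / GGML_F16_VEC macros, so each architecture only supplies load/store/FMA/add/mul/reduce primitives. The "generic reduce" from the commit message folds the accumulator registers pairwise. As a rough standalone illustration of that reduction pattern (plain C, not code from this diff; NACC and acc are made-up names standing in for GGML_F32_ARR and the accumulator array):

#include <stdio.h>

#define NACC 4 // stand-in for GGML_F32_ARR: accumulator registers per step

int main(void) {
    float acc[NACC] = {1.0f, 2.0f, 3.0f, 4.0f}; // per-register partial sums

    // same shape as GGML_F32x4_REDUCE: fold pairs, then quads, then octets
    for (int i = 0; i < NACC/2; ++i) acc[2*i] += acc[2*i+1];
    for (int i = 0; i < NACC/4; ++i) acc[4*i] += acc[4*i+2];
    for (int i = 0; i < NACC/8; ++i) acc[8*i] += acc[8*i+4];

    printf("sum = %f\n", acc[0]); // 10.000000
    return 0;
}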
ggml.c
CHANGED
@@ -316,192 +316,426 @@ int64_t ggml_cycles_per_ms(void) {
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
 //
-//
 //
 
-inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
-inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
-inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
-inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
-inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
-inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
-inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 
-inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
-    ggml_float sumf = 0.0;
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
-    // NEON 128-bit
-    const int n16 = (n & ~15);
-
-    float32x4_t sum0 = vdupq_n_f32(0);
-    float32x4_t sum1 = vdupq_n_f32(0);
-    float32x4_t sum2 = vdupq_n_f32(0);
-    float32x4_t sum3 = vdupq_n_f32(0);
-
-    float32x4_t x0, x1, x2, x3;
-    float32x4_t y0, y1, y2, y3;
-
-    for (int i = 0; i < n16; i += 16) {
-        x0 = vld1q_f32(x + i + 0);
-        x1 = vld1q_f32(x + i + 4);
-        x2 = vld1q_f32(x + i + 8);
-        x3 = vld1q_f32(x + i + 12);
-
-        y0 = vld1q_f32(y + i + 0);
-        y1 = vld1q_f32(y + i + 4);
-        y2 = vld1q_f32(y + i + 8);
-        y3 = vld1q_f32(y + i + 12);
-
-        sum0 = vfmaq_f32(sum0, x0, y0);
-        sum1 = vfmaq_f32(sum1, x1, y1);
-        sum2 = vfmaq_f32(sum2, x2, y2);
-        sum3 = vfmaq_f32(sum3, x3, y3);
-    }
 
-#
 
-    __m256 sum0 = _mm256_setzero_ps();
-    __m256 sum1 = _mm256_setzero_ps();
-    __m256 sum2 = _mm256_setzero_ps();
-    __m256 sum3 = _mm256_setzero_ps();
-
-    __m256 x0, x1, x2, x3;
-    __m256 y0, y1, y2, y3;
-
-    for (int i = 0; i < n32; i += 32) {
-        x0 = _mm256_loadu_ps(x + i + 0);
-        x1 = _mm256_loadu_ps(x + i + 8);
-        x2 = _mm256_loadu_ps(x + i + 16);
-        x3 = _mm256_loadu_ps(x + i + 24);
-
-        y0 = _mm256_loadu_ps(y + i + 0);
-        y1 = _mm256_loadu_ps(y + i + 8);
-        y2 = _mm256_loadu_ps(y + i + 16);
-        y3 = _mm256_loadu_ps(y + i + 24);
 
-    sum0 = _mm256_add_ps(sum0, sum1);
-    sum2 = _mm256_add_ps(sum2, sum3);
-    sum0 = _mm256_add_ps(sum0, sum2);
 
-#
 
-    __m256 sum0 = _mm256_setzero_ps();
-    __m256 sum1 = _mm256_setzero_ps();
-    __m256 sum2 = _mm256_setzero_ps();
-    __m256 sum3 = _mm256_setzero_ps();
-
-    __m256 x0, x1, x2, x3;
-    __m256 y0, y1, y2, y3;
-
-    for (int i = 0; i < n32; i += 32) {
-        x0 = _mm256_loadu_ps(x + i + 0);
-        x1 = _mm256_loadu_ps(x + i + 8);
-        x2 = _mm256_loadu_ps(x + i + 16);
-        x3 = _mm256_loadu_ps(x + i + 24);
-
-        y0 = _mm256_loadu_ps(y + i + 0);
-        y1 = _mm256_loadu_ps(y + i + 8);
-        y2 = _mm256_loadu_ps(y + i + 16);
-        y3 = _mm256_loadu_ps(y + i + 24);
-
-        sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0);
-        sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1);
-        sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2);
-        sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3);
-    }
 
-    sum0 = _mm256_add_ps(sum0, sum1);
-    sum2 = _mm256_add_ps(sum2, sum3);
-    sum0 = _mm256_add_ps(sum0, sum2);
 
-    const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
-    const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
 
-    sumf = _mm_cvtss_f32(r1);
 
-    // leftovers
-    for (int i = n32; i < n; ++i) {
-        sumf += x[i]*y[i];
-    }
-#elif defined(__wasm_simd128__)
-    // WASM 128-bit
-    const int n16 = (n & ~15);
 
-    v128_t sum0 = wasm_f32x4_splat(0);
-    v128_t sum1 = wasm_f32x4_splat(0);
-    v128_t sum2 = wasm_f32x4_splat(0);
-    v128_t sum3 = wasm_f32x4_splat(0);
 
-    v128_t x0, x1, x2, x3;
-    v128_t y0, y1, y2, y3;
 
-    for (int i = 0; i < n16; i += 16) {
-        x0 = wasm_v128_load(x + i + 0);
-        x1 = wasm_v128_load(x + i + 4);
-        x2 = wasm_v128_load(x + i + 8);
-        x3 = wasm_v128_load(x + i + 12);
 
-        y0 = wasm_v128_load(y + i + 0);
-        y1 = wasm_v128_load(y + i + 4);
-        y2 = wasm_v128_load(y + i + 8);
-        y3 = wasm_v128_load(y + i + 12);
 
-        sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0));
-        sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1));
-        sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2));
-        sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3));
     }
 
-    sum0 = wasm_f32x4_add(sum0, sum1);
-    sum2 = wasm_f32x4_add(sum2, sum3);
-    sum0 = wasm_f32x4_add(sum0, sum2);
-
-    sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3);
 
     // leftovers
-    for (int i = n16; i < n; ++i) {
         sumf += x[i]*y[i];
     }
 #else
@@ -516,194 +750,34 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
|
|
| 516 |
|
| 517 |
inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
|
| 518 |
ggml_float sumf = 0.0;
|
| 519 |
-
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
|
| 520 |
-
const int n32 = (n & ~31);
|
| 521 |
|
| 522 |
-
#if defined(
|
| 523 |
-
|
| 524 |
-
float16x8_t sum1 = vdupq_n_f16(0);
|
| 525 |
-
float16x8_t sum2 = vdupq_n_f16(0);
|
| 526 |
-
float16x8_t sum3 = vdupq_n_f16(0);
|
| 527 |
|
| 528 |
-
|
| 529 |
-
float16x8_t y0, y1, y2, y3;
|
| 530 |
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
x1 = vld1q_f16(x + i + 8 );
|
| 534 |
-
x2 = vld1q_f16(x + i + 16);
|
| 535 |
-
x3 = vld1q_f16(x + i + 24);
|
| 536 |
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
sum2 = vfmaq_f16(sum2, x2, y2);
|
| 545 |
-
sum3 = vfmaq_f16(sum3, x3, y3);
|
| 546 |
}
|
| 547 |
|
| 548 |
// reduce sum0..sum3 to sum0
|
| 549 |
-
|
| 550 |
-
sum2 = vaddq_f16(sum2, sum3);
|
| 551 |
-
sum0 = vaddq_f16(sum0, sum2);
|
| 552 |
-
|
| 553 |
-
// load sum0 into 2 float32x4_t
|
| 554 |
-
float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0));
|
| 555 |
-
float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0));
|
| 556 |
-
|
| 557 |
-
// reduce sum0f32 and sum1f32 to sumf
|
| 558 |
-
sum0f32 = vaddq_f32(sum0f32, sum1f32);
|
| 559 |
-
sumf = vaddvq_f32(sum0f32);
|
| 560 |
-
#else
|
| 561 |
-
float32x4_t sum0 = vdupq_n_f32(0);
|
| 562 |
-
float32x4_t sum1 = vdupq_n_f32(0);
|
| 563 |
-
float32x4_t sum2 = vdupq_n_f32(0);
|
| 564 |
-
float32x4_t sum3 = vdupq_n_f32(0);
|
| 565 |
-
float32x4_t sum4 = vdupq_n_f32(0);
|
| 566 |
-
float32x4_t sum5 = vdupq_n_f32(0);
|
| 567 |
-
float32x4_t sum6 = vdupq_n_f32(0);
|
| 568 |
-
float32x4_t sum7 = vdupq_n_f32(0);
|
| 569 |
-
|
| 570 |
-
float32x4_t x0, x1, x2, x3, x4, x5, x6, x7;
|
| 571 |
-
float32x4_t y0, y1, y2, y3, y4, y5, y6, y7;
|
| 572 |
-
|
| 573 |
-
for (int i = 0; i < n32; i += 32) {
|
| 574 |
-
x0 = vcvt_f32_f16(vld1_f16(x + i + 0 ));
|
| 575 |
-
x1 = vcvt_f32_f16(vld1_f16(x + i + 4 ));
|
| 576 |
-
x2 = vcvt_f32_f16(vld1_f16(x + i + 8 ));
|
| 577 |
-
x3 = vcvt_f32_f16(vld1_f16(x + i + 12));
|
| 578 |
-
x4 = vcvt_f32_f16(vld1_f16(x + i + 16));
|
| 579 |
-
x5 = vcvt_f32_f16(vld1_f16(x + i + 20));
|
| 580 |
-
x6 = vcvt_f32_f16(vld1_f16(x + i + 24));
|
| 581 |
-
x7 = vcvt_f32_f16(vld1_f16(x + i + 28));
|
| 582 |
-
|
| 583 |
-
y0 = vcvt_f32_f16(vld1_f16(y + i + 0 ));
|
| 584 |
-
y1 = vcvt_f32_f16(vld1_f16(y + i + 4 ));
|
| 585 |
-
y2 = vcvt_f32_f16(vld1_f16(y + i + 8 ));
|
| 586 |
-
y3 = vcvt_f32_f16(vld1_f16(y + i + 12));
|
| 587 |
-
y4 = vcvt_f32_f16(vld1_f16(y + i + 16));
|
| 588 |
-
y5 = vcvt_f32_f16(vld1_f16(y + i + 20));
|
| 589 |
-
y6 = vcvt_f32_f16(vld1_f16(y + i + 24));
|
| 590 |
-
y7 = vcvt_f32_f16(vld1_f16(y + i + 28));
|
| 591 |
-
|
| 592 |
-
sum0 = vfmaq_f32(sum0, x0, y0);
|
| 593 |
-
sum1 = vfmaq_f32(sum1, x1, y1);
|
| 594 |
-
sum2 = vfmaq_f32(sum2, x2, y2);
|
| 595 |
-
sum3 = vfmaq_f32(sum3, x3, y3);
|
| 596 |
-
sum4 = vfmaq_f32(sum4, x4, y4);
|
| 597 |
-
sum5 = vfmaq_f32(sum5, x5, y5);
|
| 598 |
-
sum6 = vfmaq_f32(sum6, x6, y6);
|
| 599 |
-
sum7 = vfmaq_f32(sum7, x7, y7);
|
| 600 |
-
}
|
| 601 |
-
|
| 602 |
-
// reduce sum0..sum7 to sum0
|
| 603 |
-
sum0 = vaddq_f32(sum0, sum1);
|
| 604 |
-
sum2 = vaddq_f32(sum2, sum3);
|
| 605 |
-
sum4 = vaddq_f32(sum4, sum5);
|
| 606 |
-
sum6 = vaddq_f32(sum6, sum7);
|
| 607 |
-
sum0 = vaddq_f32(sum0, sum2);
|
| 608 |
-
sum4 = vaddq_f32(sum4, sum6);
|
| 609 |
-
sum0 = vaddq_f32(sum0, sum4);
|
| 610 |
-
|
| 611 |
-
sumf = vaddvq_f32(sum0);
|
| 612 |
-
#endif
|
| 613 |
|
| 614 |
// leftovers
|
| 615 |
-
for (int i =
|
| 616 |
-
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
| 617 |
-
}
|
| 618 |
-
#elif defined(__AVX2__)
|
| 619 |
-
// AVX 256-bit
|
| 620 |
-
const int n32 = (n & ~31);
|
| 621 |
-
|
| 622 |
-
__m256 sum0 = _mm256_setzero_ps();
|
| 623 |
-
__m256 sum1 = _mm256_setzero_ps();
|
| 624 |
-
__m256 sum2 = _mm256_setzero_ps();
|
| 625 |
-
__m256 sum3 = _mm256_setzero_ps();
|
| 626 |
-
|
| 627 |
-
__m256 x0, x1, x2, x3;
|
| 628 |
-
__m256 y0, y1, y2, y3;
|
| 629 |
-
|
| 630 |
-
for (int i = 0; i < n32; i += 32) {
|
| 631 |
-
x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 )));
|
| 632 |
-
x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 )));
|
| 633 |
-
x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
|
| 634 |
-
x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
|
| 635 |
-
|
| 636 |
-
y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 )));
|
| 637 |
-
y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 )));
|
| 638 |
-
y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
|
| 639 |
-
y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
|
| 640 |
-
|
| 641 |
-
sum0 = _mm256_fmadd_ps(x0, y0, sum0);
|
| 642 |
-
sum1 = _mm256_fmadd_ps(x1, y1, sum1);
|
| 643 |
-
sum2 = _mm256_fmadd_ps(x2, y2, sum2);
|
| 644 |
-
sum3 = _mm256_fmadd_ps(x3, y3, sum3);
|
| 645 |
-
}
|
| 646 |
-
|
| 647 |
-
const __m256 sum01 = _mm256_add_ps(sum0, sum1);
|
| 648 |
-
const __m256 sum23 = _mm256_add_ps(sum2, sum3);
|
| 649 |
-
const __m256 sum0123 = _mm256_add_ps(sum01, sum23);
|
| 650 |
-
|
| 651 |
-
const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0123), _mm256_extractf128_ps(sum0123, 1));
|
| 652 |
-
const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
|
| 653 |
-
const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
|
| 654 |
-
|
| 655 |
-
sumf = _mm_cvtss_f32(r1);
|
| 656 |
-
|
| 657 |
-
// leftovers
|
| 658 |
-
for (int i = n32; i < n; ++i) {
|
| 659 |
-
//GGML_ASSERT(false);
|
| 660 |
-
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
| 661 |
-
}
|
| 662 |
-
#elif defined(__AVX__)
|
| 663 |
-
// AVX 256-bit
|
| 664 |
-
const int n32 = (n & ~31);
|
| 665 |
-
|
| 666 |
-
__m256 sum0 = _mm256_setzero_ps();
|
| 667 |
-
__m256 sum1 = _mm256_setzero_ps();
|
| 668 |
-
__m256 sum2 = _mm256_setzero_ps();
|
| 669 |
-
__m256 sum3 = _mm256_setzero_ps();
|
| 670 |
-
|
| 671 |
-
__m256 x0, x1, x2, x3;
|
| 672 |
-
__m256 y0, y1, y2, y3;
|
| 673 |
-
|
| 674 |
-
for (int i = 0; i < n32; i += 32) {
|
| 675 |
-
x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 )));
|
| 676 |
-
x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 )));
|
| 677 |
-
x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
|
| 678 |
-
x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
|
| 679 |
-
|
| 680 |
-
y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 )));
|
| 681 |
-
y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 )));
|
| 682 |
-
y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
|
| 683 |
-
y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
|
| 684 |
-
|
| 685 |
-
sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0);
|
| 686 |
-
sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1);
|
| 687 |
-
sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2);
|
| 688 |
-
sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3);
|
| 689 |
-
}
|
| 690 |
-
|
| 691 |
-
const __m256 sum01 = _mm256_add_ps(sum0, sum1);
|
| 692 |
-
const __m256 sum23 = _mm256_add_ps(sum2, sum3);
|
| 693 |
-
const __m256 sum0123 = _mm256_add_ps(sum01, sum23);
|
| 694 |
-
|
| 695 |
-
const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0123), _mm256_extractf128_ps(sum0123, 1));
|
| 696 |
-
const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
|
| 697 |
-
const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
|
| 698 |
-
|
| 699 |
-
sumf = _mm_cvtss_f32(r1);
|
| 700 |
-
|
| 701 |
-
// leftovers
|
| 702 |
-
for (int i = n32; i < n; ++i) {
|
| 703 |
-
//GGML_ASSERT(false);
|
| 704 |
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
| 705 |
}
|
| 706 |
#elif defined(__POWER9_VECTOR__)
|
|
|
|
|
|
|
| 707 |
const int n32 = (n & ~31);
|
| 708 |
|
| 709 |
vector float sum0 = vec_splats (0.0f);
|
|
@@ -754,54 +828,6 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
     for (int i = n32; i < n; ++i) {
         sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
     }
-#elif defined(__wasm_simd128__)
-    // WASM 128-bit
-    const int n16 = (n & ~15);
-
-    v128_t sum0 = wasm_f32x4_splat(0.0f);
-    v128_t sum1 = wasm_f32x4_splat(0.0f);
-    v128_t sum2 = wasm_f32x4_splat(0.0f);
-    v128_t sum3 = wasm_f32x4_splat(0.0f);
-
-    v128_t x0, x1, x2, x3;
-    v128_t y0, y1, y2, y3;
-
-    float tx[16];
-    float ty[16];
-
-    for (int i = 0; i < n16; i += 16) {
-        for (int k = 0; k < 16; ++k) {
-            tx[k] = GGML_FP16_TO_FP32(x[i + k]);
-            ty[k] = GGML_FP16_TO_FP32(y[i + k]);
-        }
-
-        x0 = wasm_v128_load(tx + 0);
-        x1 = wasm_v128_load(tx + 4);
-        x2 = wasm_v128_load(tx + 8);
-        x3 = wasm_v128_load(tx + 12);
-
-        y0 = wasm_v128_load(ty + 0);
-        y1 = wasm_v128_load(ty + 4);
-        y2 = wasm_v128_load(ty + 8);
-        y3 = wasm_v128_load(ty + 12);
-
-        sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0));
-        sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1));
-        sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2));
-        sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3));
-    }
-
-    sum0 = wasm_f32x4_add(sum0, sum1);
-    sum2 = wasm_f32x4_add(sum2, sum3);
-    sum0 = wasm_f32x4_add(sum0, sum2);
-
-    sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3);
-
-    // leftovers
-    for (int i = n16; i < n; ++i) {
-        //GGML_ASSERT(false);
-        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
-    }
 #else
     for (int i = 0; i < n; ++i) {
         sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
@@ -812,144 +838,26 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|
| 812 |
}
|
| 813 |
|
| 814 |
inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
|
| 815 |
-
#if defined(
|
| 816 |
-
|
| 817 |
-
const int n16 = (n & ~15);
|
| 818 |
-
|
| 819 |
-
const float32x4_t v4 = vdupq_n_f32(v);
|
| 820 |
-
|
| 821 |
-
float32x4_t x0, x1, x2, x3;
|
| 822 |
-
float32x4_t y0, y1, y2, y3;
|
| 823 |
-
|
| 824 |
-
for (int i = 0; i < n16; i += 16) {
|
| 825 |
-
x0 = vld1q_f32(x + i + 0);
|
| 826 |
-
x1 = vld1q_f32(x + i + 4);
|
| 827 |
-
x2 = vld1q_f32(x + i + 8);
|
| 828 |
-
x3 = vld1q_f32(x + i + 12);
|
| 829 |
-
|
| 830 |
-
y0 = vld1q_f32(y + i + 0);
|
| 831 |
-
y1 = vld1q_f32(y + i + 4);
|
| 832 |
-
y2 = vld1q_f32(y + i + 8);
|
| 833 |
-
y3 = vld1q_f32(y + i + 12);
|
| 834 |
-
|
| 835 |
-
y0 = vfmaq_f32(y0, x0, v4);
|
| 836 |
-
y1 = vfmaq_f32(y1, x1, v4);
|
| 837 |
-
y2 = vfmaq_f32(y2, x2, v4);
|
| 838 |
-
y3 = vfmaq_f32(y3, x3, v4);
|
| 839 |
-
|
| 840 |
-
vst1q_f32(y + i + 0, y0);
|
| 841 |
-
vst1q_f32(y + i + 4, y1);
|
| 842 |
-
vst1q_f32(y + i + 8, y2);
|
| 843 |
-
vst1q_f32(y + i + 12, y3);
|
| 844 |
-
}
|
| 845 |
-
|
| 846 |
-
// leftovers
|
| 847 |
-
for (int i = n16; i < n; ++i) {
|
| 848 |
-
y[i] += x[i]*v;
|
| 849 |
-
}
|
| 850 |
-
#elif defined(__AVX2__)
|
| 851 |
-
// AVX 256-bit
|
| 852 |
-
const int n32 = (n & ~31);
|
| 853 |
-
|
| 854 |
-
const __m256 v4 = _mm256_set1_ps(v);
|
| 855 |
|
| 856 |
-
|
| 857 |
-
__m256 y0, y1, y2, y3;
|
| 858 |
-
|
| 859 |
-
for (int i = 0; i < n32; i += 32) {
|
| 860 |
-
x0 = _mm256_loadu_ps(x + i + 0);
|
| 861 |
-
x1 = _mm256_loadu_ps(x + i + 8);
|
| 862 |
-
x2 = _mm256_loadu_ps(x + i + 16);
|
| 863 |
-
x3 = _mm256_loadu_ps(x + i + 24);
|
| 864 |
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
y2 = _mm256_loadu_ps(y + i + 16);
|
| 868 |
-
y3 = _mm256_loadu_ps(y + i + 24);
|
| 869 |
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
|
|
|
| 874 |
|
| 875 |
-
|
| 876 |
-
|
| 877 |
-
_mm256_storeu_ps(y + i + 16, y2);
|
| 878 |
-
_mm256_storeu_ps(y + i + 24, y3);
|
| 879 |
-
}
|
| 880 |
-
|
| 881 |
-
// leftovers
|
| 882 |
-
for (int i = n32; i < n; ++i) {
|
| 883 |
-
y[i] += x[i]*v;
|
| 884 |
-
}
|
| 885 |
-
#elif defined(__AVX__)
|
| 886 |
-
// AVX 256-bit
|
| 887 |
-
const int n32 = (n & ~31);
|
| 888 |
-
|
| 889 |
-
const __m256 v4 = _mm256_set1_ps(v);
|
| 890 |
-
|
| 891 |
-
__m256 x0, x1, x2, x3;
|
| 892 |
-
__m256 y0, y1, y2, y3;
|
| 893 |
-
|
| 894 |
-
for (int i = 0; i < n32; i += 32) {
|
| 895 |
-
x0 = _mm256_loadu_ps(x + i + 0);
|
| 896 |
-
x1 = _mm256_loadu_ps(x + i + 8);
|
| 897 |
-
x2 = _mm256_loadu_ps(x + i + 16);
|
| 898 |
-
x3 = _mm256_loadu_ps(x + i + 24);
|
| 899 |
-
|
| 900 |
-
y0 = _mm256_loadu_ps(y + i + 0);
|
| 901 |
-
y1 = _mm256_loadu_ps(y + i + 8);
|
| 902 |
-
y2 = _mm256_loadu_ps(y + i + 16);
|
| 903 |
-
y3 = _mm256_loadu_ps(y + i + 24);
|
| 904 |
-
|
| 905 |
-
y0 = _mm256_add_ps(_mm256_mul_ps(x0, v4), y0);
|
| 906 |
-
y1 = _mm256_add_ps(_mm256_mul_ps(x1, v4), y1);
|
| 907 |
-
y2 = _mm256_add_ps(_mm256_mul_ps(x2, v4), y2);
|
| 908 |
-
y3 = _mm256_add_ps(_mm256_mul_ps(x3, v4), y3);
|
| 909 |
-
|
| 910 |
-
_mm256_storeu_ps(y + i + 0, y0);
|
| 911 |
-
_mm256_storeu_ps(y + i + 8, y1);
|
| 912 |
-
_mm256_storeu_ps(y + i + 16, y2);
|
| 913 |
-
_mm256_storeu_ps(y + i + 24, y3);
|
| 914 |
-
}
|
| 915 |
-
|
| 916 |
-
// leftovers
|
| 917 |
-
for (int i = n32; i < n; ++i) {
|
| 918 |
-
y[i] += x[i]*v;
|
| 919 |
-
}
|
| 920 |
-
#elif defined(__wasm_simd128__)
|
| 921 |
-
// WASM SIMD 128-bit
|
| 922 |
-
const int n16 = (n & ~15);
|
| 923 |
-
|
| 924 |
-
const v128_t v4 = wasm_f32x4_splat(v);
|
| 925 |
-
|
| 926 |
-
v128_t x0, x1, x2, x3;
|
| 927 |
-
v128_t y0, y1, y2, y3;
|
| 928 |
-
|
| 929 |
-
for (int i = 0; i < n16; i += 16) {
|
| 930 |
-
x0 = wasm_v128_load(x + i + 0);
|
| 931 |
-
x1 = wasm_v128_load(x + i + 4);
|
| 932 |
-
x2 = wasm_v128_load(x + i + 8);
|
| 933 |
-
x3 = wasm_v128_load(x + i + 12);
|
| 934 |
-
|
| 935 |
-
y0 = wasm_v128_load(y + i + 0);
|
| 936 |
-
y1 = wasm_v128_load(y + i + 4);
|
| 937 |
-
y2 = wasm_v128_load(y + i + 8);
|
| 938 |
-
y3 = wasm_v128_load(y + i + 12);
|
| 939 |
-
|
| 940 |
-
y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
|
| 941 |
-
y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
|
| 942 |
-
y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
|
| 943 |
-
y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));
|
| 944 |
-
|
| 945 |
-
wasm_v128_store(y + i + 0, y0);
|
| 946 |
-
wasm_v128_store(y + i + 4, y1);
|
| 947 |
-
wasm_v128_store(y + i + 8, y2);
|
| 948 |
-
wasm_v128_store(y + i + 12, y3);
|
| 949 |
}
|
| 950 |
|
| 951 |
// leftovers
|
| 952 |
-
for (int i =
|
| 953 |
y[i] += x[i]*v;
|
| 954 |
}
|
| 955 |
#else
|
|
@@ -961,263 +869,86 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
|
|
| 961 |
}
|
| 962 |
|
| 963 |
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) {
|
| 964 |
-
#if defined(
|
| 965 |
-
|
| 966 |
-
const int n32 = (n & ~31);
|
| 967 |
-
|
| 968 |
-
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
| 969 |
-
const float16x8_t v8 = vdupq_n_f16(v);
|
| 970 |
|
| 971 |
-
|
| 972 |
-
float16x8_t y0, y1, y2, y3;
|
| 973 |
|
| 974 |
-
|
| 975 |
-
|
| 976 |
-
y1 = vld1q_f16(y + i + 8 );
|
| 977 |
-
y2 = vld1q_f16(y + i + 16);
|
| 978 |
-
y3 = vld1q_f16(y + i + 24);
|
| 979 |
-
|
| 980 |
-
x0 = vld1q_f16(x + i + 0 );
|
| 981 |
-
x1 = vld1q_f16(x + i + 8 );
|
| 982 |
-
x2 = vld1q_f16(x + i + 16);
|
| 983 |
-
x3 = vld1q_f16(x + i + 24);
|
| 984 |
-
|
| 985 |
-
y0 = vfmaq_f16(y0, x0, v8);
|
| 986 |
-
y1 = vfmaq_f16(y1, x1, v8);
|
| 987 |
-
y2 = vfmaq_f16(y2, x2, v8);
|
| 988 |
-
y3 = vfmaq_f16(y3, x3, v8);
|
| 989 |
-
|
| 990 |
-
vst1q_f16(y + i + 0 , y0);
|
| 991 |
-
vst1q_f16(y + i + 8 , y1);
|
| 992 |
-
vst1q_f16(y + i + 16, y2);
|
| 993 |
-
vst1q_f16(y + i + 24, y3);
|
| 994 |
-
}
|
| 995 |
-
#else
|
| 996 |
-
const float32x4_t v40 = vdupq_n_f32(v);
|
| 997 |
-
const float32x4_t v41 = vdupq_n_f32(v);
|
| 998 |
|
| 999 |
-
|
| 1000 |
-
|
|
|
|
|
|
|
|
|
|
| 1001 |
|
| 1002 |
-
|
| 1003 |
-
|
| 1004 |
-
y1 = vcvt_f32_f16(vld1_f16(y + i + 4 ));
|
| 1005 |
-
y2 = vcvt_f32_f16(vld1_f16(y + i + 8 ));
|
| 1006 |
-
y3 = vcvt_f32_f16(vld1_f16(y + i + 12));
|
| 1007 |
-
y4 = vcvt_f32_f16(vld1_f16(y + i + 16));
|
| 1008 |
-
y5 = vcvt_f32_f16(vld1_f16(y + i + 20));
|
| 1009 |
-
y6 = vcvt_f32_f16(vld1_f16(y + i + 24));
|
| 1010 |
-
y7 = vcvt_f32_f16(vld1_f16(y + i + 28));
|
| 1011 |
-
|
| 1012 |
-
x0 = vcvt_f32_f16(vld1_f16(x + i + 0 ));
|
| 1013 |
-
x1 = vcvt_f32_f16(vld1_f16(x + i + 4 ));
|
| 1014 |
-
x2 = vcvt_f32_f16(vld1_f16(x + i + 8 ));
|
| 1015 |
-
x3 = vcvt_f32_f16(vld1_f16(x + i + 12));
|
| 1016 |
-
x4 = vcvt_f32_f16(vld1_f16(x + i + 16));
|
| 1017 |
-
x5 = vcvt_f32_f16(vld1_f16(x + i + 20));
|
| 1018 |
-
x6 = vcvt_f32_f16(vld1_f16(x + i + 24));
|
| 1019 |
-
x7 = vcvt_f32_f16(vld1_f16(x + i + 28));
|
| 1020 |
-
|
| 1021 |
-
y0 = vfmaq_f32(y0, x0, v40);
|
| 1022 |
-
y1 = vfmaq_f32(y1, x1, v40);
|
| 1023 |
-
y2 = vfmaq_f32(y2, x2, v40);
|
| 1024 |
-
y3 = vfmaq_f32(y3, x3, v40);
|
| 1025 |
-
y4 = vfmaq_f32(y4, x4, v41);
|
| 1026 |
-
y5 = vfmaq_f32(y5, x5, v41);
|
| 1027 |
-
y6 = vfmaq_f32(y6, x6, v41);
|
| 1028 |
-
y7 = vfmaq_f32(y7, x7, v41);
|
| 1029 |
-
|
| 1030 |
-
vst1_f16(y + i + 0 , vcvt_f16_f32(y0));
|
| 1031 |
-
vst1_f16(y + i + 4 , vcvt_f16_f32(y1));
|
| 1032 |
-
vst1_f16(y + i + 8 , vcvt_f16_f32(y2));
|
| 1033 |
-
vst1_f16(y + i + 12, vcvt_f16_f32(y3));
|
| 1034 |
-
vst1_f16(y + i + 16, vcvt_f16_f32(y4));
|
| 1035 |
-
vst1_f16(y + i + 20, vcvt_f16_f32(y5));
|
| 1036 |
-
vst1_f16(y + i + 24, vcvt_f16_f32(y6));
|
| 1037 |
-
vst1_f16(y + i + 28, vcvt_f16_f32(y7));
|
| 1038 |
}
|
| 1039 |
-
#endif
|
| 1040 |
|
| 1041 |
// leftovers
|
| 1042 |
-
for (int i =
|
| 1043 |
GGML_ASSERT(false);
|
| 1044 |
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
| 1045 |
}
|
| 1046 |
-
#elif defined(
|
| 1047 |
-
//
|
|
|
|
| 1048 |
const int n32 = (n & ~31);
|
| 1049 |
-
|
| 1050 |
-
const __m256 v8 = _mm256_set1_ps(v);
|
| 1051 |
-
|
| 1052 |
-
__m256 x0, x1, x2, x3;
|
| 1053 |
-
__m256 y0, y1, y2, y3;
|
| 1054 |
-
|
| 1055 |
for (int i = 0; i < n32; i += 32) {
|
| 1056 |
-
|
| 1057 |
-
|
| 1058 |
-
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 )));
|
| 1062 |
-
x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 )));
|
| 1063 |
-
x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
|
| 1064 |
-
x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
|
| 1065 |
-
|
| 1066 |
-
y0 = _mm256_fmadd_ps(x0, v8, y0);
|
| 1067 |
-
y1 = _mm256_fmadd_ps(x1, v8, y1);
|
| 1068 |
-
y2 = _mm256_fmadd_ps(x2, v8, y2);
|
| 1069 |
-
y3 = _mm256_fmadd_ps(x3, v8, y3);
|
| 1070 |
-
|
| 1071 |
-
_mm_storeu_si128((__m128i*)(y + i + 0 ), _mm256_cvtps_ph(y0, 0));
|
| 1072 |
-
_mm_storeu_si128((__m128i*)(y + i + 8 ), _mm256_cvtps_ph(y1, 0));
|
| 1073 |
-
_mm_storeu_si128((__m128i*)(y + i + 16), _mm256_cvtps_ph(y2, 0));
|
| 1074 |
-
_mm_storeu_si128((__m128i*)(y + i + 24), _mm256_cvtps_ph(y3, 0));
|
| 1075 |
-
}
|
| 1076 |
|
| 1077 |
-
|
| 1078 |
-
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
}
|
| 1082 |
-
#elif defined(__AVX__)
|
| 1083 |
-
// AVX 256-bit
|
| 1084 |
-
const int n32 = (n & ~31);
|
| 1085 |
|
| 1086 |
-
|
| 1087 |
|
| 1088 |
-
|
| 1089 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1090 |
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
-
|
| 1095 |
-
|
|
|
|
|
|
|
|
|
|
| 1096 |
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1101 |
|
| 1102 |
-
y0 =
|
| 1103 |
-
y1 =
|
| 1104 |
-
y2 =
|
| 1105 |
-
y3 =
|
| 1106 |
|
| 1107 |
-
|
| 1108 |
-
|
| 1109 |
-
|
| 1110 |
-
|
| 1111 |
}
|
| 1112 |
|
| 1113 |
-
// leftovers
|
| 1114 |
for (int i = n32; i < n; ++i) {
|
| 1115 |
-
GGML_ASSERT(false);
|
| 1116 |
-
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
| 1117 |
-
}
|
| 1118 |
-
#elif defined(__POWER9_VECTOR__)
|
| 1119 |
-
const int n32 = (n & ~31);
|
| 1120 |
-
for (int i = 0; i < n32; i += 32) {
|
| 1121 |
-
// Use vec_xl, not vec_ld, because x is sometimes unaligned!
|
| 1122 |
-
vector unsigned short x0 = vec_xl(i * 2 + 0, x);
|
| 1123 |
-
vector unsigned short x1 = vec_xl(i * 2 + 16, x);
|
| 1124 |
-
vector unsigned short x2 = vec_xl(i * 2 + 32, x);
|
| 1125 |
-
vector unsigned short x3 = vec_xl(i * 2 + 48, x);
|
| 1126 |
-
|
| 1127 |
-
vector unsigned short y0 = vec_xl(i * 2 + 0, y);
|
| 1128 |
-
vector unsigned short y1 = vec_xl(i * 2 + 16, y);
|
| 1129 |
-
vector unsigned short y2 = vec_xl(i * 2 + 32, y);
|
| 1130 |
-
vector unsigned short y3 = vec_xl(i * 2 + 48, y);
|
| 1131 |
-
|
| 1132 |
-
vector float v4 = vec_splats(v);
|
| 1133 |
-
|
| 1134 |
-
vector float fx0l = vec_extract_fp32_from_shortl(x0);
|
| 1135 |
-
vector float fx0h = vec_extract_fp32_from_shorth(x0);
|
| 1136 |
-
vector float fx1l = vec_extract_fp32_from_shortl(x1);
|
| 1137 |
-
vector float fx1h = vec_extract_fp32_from_shorth(x1);
|
| 1138 |
-
vector float fx2l = vec_extract_fp32_from_shortl(x2);
|
| 1139 |
-
vector float fx2h = vec_extract_fp32_from_shorth(x2);
|
| 1140 |
-
vector float fx3l = vec_extract_fp32_from_shortl(x3);
|
| 1141 |
-
vector float fx3h = vec_extract_fp32_from_shorth(x3);
|
| 1142 |
-
|
| 1143 |
-
vector float fy0l = vec_extract_fp32_from_shortl(y0);
|
| 1144 |
-
vector float fy0h = vec_extract_fp32_from_shorth(y0);
|
| 1145 |
-
vector float fy1l = vec_extract_fp32_from_shortl(y1);
|
| 1146 |
-
vector float fy1h = vec_extract_fp32_from_shorth(y1);
|
| 1147 |
-
vector float fy2l = vec_extract_fp32_from_shortl(y2);
|
| 1148 |
-
vector float fy2h = vec_extract_fp32_from_shorth(y2);
|
| 1149 |
-
vector float fy3l = vec_extract_fp32_from_shortl(y3);
|
| 1150 |
-
vector float fy3h = vec_extract_fp32_from_shorth(y3);
|
| 1151 |
-
|
| 1152 |
-
fy0l = vec_madd(fx0l, v4, fy0l);
|
| 1153 |
-
fy0h = vec_madd(fx0h, v4, fy0h);
|
| 1154 |
-
fy1l = vec_madd(fx1l, v4, fy1l);
|
| 1155 |
-
fy1h = vec_madd(fx1h, v4, fy1h);
|
| 1156 |
-
fy2l = vec_madd(fx2l, v4, fy2l);
|
| 1157 |
-
fy2h = vec_madd(fx2h, v4, fy2h);
|
| 1158 |
-
fy3l = vec_madd(fx3l, v4, fy3l);
|
| 1159 |
-
fy3h = vec_madd(fx3h, v4, fy3h);
|
| 1160 |
-
|
| 1161 |
-
y0 = vec_pack_to_short_fp32(fy0h, fy0l);
|
| 1162 |
-
y1 = vec_pack_to_short_fp32(fy1h, fy1l);
|
| 1163 |
-
y2 = vec_pack_to_short_fp32(fy2h, fy2l);
|
| 1164 |
-
y3 = vec_pack_to_short_fp32(fy3h, fy3l);
|
| 1165 |
-
|
| 1166 |
-
vec_xst(y0, i * 2 + 0, y);
|
| 1167 |
-
vec_xst(y1, i * 2 + 16, y);
|
| 1168 |
-
vec_xst(y2, i * 2 + 32, y);
|
| 1169 |
-
vec_xst(y3, i * 2 + 48, y);
|
| 1170 |
-
}
|
| 1171 |
-
|
| 1172 |
-
for (int i = n32; i < n; ++i) {
|
| 1173 |
-
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
| 1174 |
-
}
|
| 1175 |
-
#elif defined(__wasm_simd128__)
|
| 1176 |
-
// WASM SIMD 128-bit
|
| 1177 |
-
const int n16 = (n & ~15);
|
| 1178 |
-
|
| 1179 |
-
const v128_t v4 = wasm_f32x4_splat(v);
|
| 1180 |
-
|
| 1181 |
-
v128_t x0, x1, x2, x3;
|
| 1182 |
-
v128_t y0, y1, y2, y3;
|
| 1183 |
-
|
| 1184 |
-
float tx[16];
|
| 1185 |
-
float ty[16];
|
| 1186 |
-
|
| 1187 |
-
for (int i = 0; i < n16; i += 16) {
|
| 1188 |
-
for (int k = 0; k < 16; ++k) {
|
| 1189 |
-
tx[k] = GGML_FP16_TO_FP32(x[i + k]);
|
| 1190 |
-
ty[k] = GGML_FP16_TO_FP32(y[i + k]);
|
| 1191 |
-
}
|
| 1192 |
-
|
| 1193 |
-
x0 = wasm_v128_load(tx + 0);
|
| 1194 |
-
x1 = wasm_v128_load(tx + 4);
|
| 1195 |
-
x2 = wasm_v128_load(tx + 8);
|
| 1196 |
-
x3 = wasm_v128_load(tx + 12);
|
| 1197 |
-
|
| 1198 |
-
y0 = wasm_v128_load(ty + 0);
|
| 1199 |
-
y1 = wasm_v128_load(ty + 4);
|
| 1200 |
-
y2 = wasm_v128_load(ty + 8);
|
| 1201 |
-
y3 = wasm_v128_load(ty + 12);
|
| 1202 |
-
|
| 1203 |
-
y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
|
| 1204 |
-
y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
|
| 1205 |
-
y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
|
| 1206 |
-
y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));
|
| 1207 |
-
|
| 1208 |
-
wasm_v128_store(ty + 0, y0);
|
| 1209 |
-
wasm_v128_store(ty + 4, y1);
|
| 1210 |
-
wasm_v128_store(ty + 8, y2);
|
| 1211 |
-
wasm_v128_store(ty + 12, y3);
|
| 1212 |
-
|
| 1213 |
-
for (int k = 0; k < 16; ++k) {
|
| 1214 |
-
y[i + k] = GGML_FP32_TO_FP16(ty[k]);
|
| 1215 |
-
}
|
| 1216 |
-
}
|
| 1217 |
-
|
| 1218 |
-
// leftovers
|
| 1219 |
-
for (int i = n16; i < n; ++i) {
|
| 1220 |
-
GGML_ASSERT(false);
|
| 1221 |
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
| 1222 |
}
|
| 1223 |
#else
|
|
@@ -1229,33 +960,24 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) {
 
 //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
 inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
-#if defined(
-    const int n32 = (n & ~31);
 
 
-    for (int i = 0; i < n32; i += 32) {
-        y0 = _mm256_loadu_ps(y + i + 0);
-        y1 = _mm256_loadu_ps(y + i + 8);
-        y2 = _mm256_loadu_ps(y + i + 16);
-        y3 = _mm256_loadu_ps(y + i + 24);
 
-        _mm256_storeu_ps(y + i + 0, y0);
-        _mm256_storeu_ps(y + i + 8, y1);
-        _mm256_storeu_ps(y + i + 16, y2);
-        _mm256_storeu_ps(y + i + 24, y3);
     }
 
     // leftovers
-    for (int i = n32; i < n; ++i) {
         y[i] *= v;
     }
 #else
@@ -8533,6 +8255,14 @@ int ggml_cpu_has_avx512(void) {
 #endif
 }
 
 int ggml_cpu_has_neon(void) {
 #if defined(__ARM_NEON)
     return 1;
@@ -316,192 +316,426 @@ int64_t ggml_cycles_per_ms(void) {
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
 //
+// simd mappings
 //
 
+// we define a common set of C macros which map to specific intrinsics based on the current architecture
+// we then implement the fundamental computation operations below using only these macros
+// adding support for new architectures requires to define the corresponding SIMD macros
+//
+// GGML_F32_STEP / GGML_F16_STEP
+//   number of elements to process in a single step
+//
+// GGML_F32_EPR / GGML_F16_EPR
+//   number of elements to fit in a single register
+//
 
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
 
+#define GGML_SIMD
+
+// F32 NEON
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              float32x4_t
+#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)
+#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)
+#define GGML_F32x4_LOAD         vld1q_f32
+#define GGML_F32x4_STORE        vst1q_f32
+#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
+#define GGML_F32x4_ADD          vaddq_f32
+#define GGML_F32x4_MUL          vmulq_f32
+#if defined(__ARM_FEATURE_QRDMX)
+#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
+#else
+#define GGML_F32x4_REDUCE_ONE(x) \
+    (vgetq_lane_f32(x, 0) +      \
+     vgetq_lane_f32(x, 1) +      \
+     vgetq_lane_f32(x, 2) +      \
+     vgetq_lane_f32(x, 3))
+#endif
+#define GGML_F32x4_REDUCE(res, x)              \
+{                                              \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
+        x[2*i] = vaddq_f32(x[2*i], x[2*i+1]);  \
+    }                                          \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
+        x[4*i] = vaddq_f32(x[4*i], x[4*i+2]);  \
+    }                                          \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
+        x[8*i] = vaddq_f32(x[8*i], x[8*i+4]);  \
+    }                                          \
+    res = GGML_F32x4_REDUCE_ONE(x[0]);         \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
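With GGML_F32_STEP 16 and GGML_F32_EPR 4, the NEON build ends up with GGML_F32_ARR = 4 accumulators (defined further down), and GGML_F32x4_REDUCE(res, sum) unrolls to roughly the following. This is a hand-expansion for illustration only; the helper name reduce_f32x4_4 is hypothetical:

#include <arm_neon.h>

// hand expansion of GGML_F32x4_REDUCE for GGML_F32_ARR == 4 (illustrative only)
static inline float reduce_f32x4_4(float32x4_t sum[4]) {
    sum[0] = vaddq_f32(sum[0], sum[1]); // i = 0 of the GGML_F32_ARR/2 loop
    sum[2] = vaddq_f32(sum[2], sum[3]); // i = 1 of the GGML_F32_ARR/2 loop
    sum[0] = vaddq_f32(sum[0], sum[2]); // i = 0 of the GGML_F32_ARR/4 loop
    // the GGML_F32_ARR/8 loop has zero iterations when GGML_F32_ARR == 4

    // GGML_F32x4_REDUCE_ONE without __ARM_FEATURE_QRDMX: lane-wise horizontal add
    return vgetq_lane_f32(sum[0], 0) + vgetq_lane_f32(sum[0], 1) +
           vgetq_lane_f32(sum[0], 2) + vgetq_lane_f32(sum[0], 3);
}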
+// F16 NEON
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  8
+
+#define GGML_F16x8              float16x8_t
+#define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
+#define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
+#define GGML_F16x8_LOAD         vld1q_f16
+#define GGML_F16x8_STORE        vst1q_f16
+#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
+#define GGML_F16x8_ADD          vaddq_f16
+#define GGML_F16x8_MUL          vmulq_f16
+#define GGML_F16x8_REDUCE(res, x)                           \
+{                                                           \
+    for (int i = 0; i < GGML_F16_ARR/2; ++i) {              \
+        x[2*i] = vaddq_f16(x[2*i], x[2*i+1]);               \
+    }                                                       \
+    for (int i = 0; i < GGML_F16_ARR/4; ++i) {              \
+        x[4*i] = vaddq_f16(x[4*i], x[4*i+2]);               \
+    }                                                       \
+    for (int i = 0; i < GGML_F16_ARR/8; ++i) {              \
+        x[8*i] = vaddq_f16(x[8*i], x[8*i+4]);               \
+    }                                                       \
+    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
+    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
+    res = vaddvq_f32(vaddq_f32(t0, t1));                    \
+}
+
+#define GGML_F16_VEC        GGML_F16x8
+#define GGML_F16_VEC_ZERO   GGML_F16x8_ZERO
+#define GGML_F16_VEC_SET1   GGML_F16x8_SET1
+#define GGML_F16_VEC_LOAD   GGML_F16x8_LOAD
+#define GGML_F16_VEC_STORE  GGML_F16x8_STORE
+#define GGML_F16_VEC_FMA    GGML_F16x8_FMA
+#define GGML_F16_VEC_ADD    GGML_F16x8_ADD
+#define GGML_F16_VEC_MUL    GGML_F16x8_MUL
+#define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
+#else
+// if FP16 vector arithmetic is not supported, we use FP32 instead
+// and take advantage of the vcvt_ functions to convert to/from FP16
+
+#define GGML_F16_STEP 16
+#define GGML_F16_EPR  4
+
+#define GGML_F32Cx4              float32x4_t
+#define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
+#define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
+#define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16(x))
+#define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
+#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
+#define GGML_F32Cx4_ADD          vaddq_f32
+#define GGML_F32Cx4_MUL          vmulq_f32
+#define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC        GGML_F32Cx4
+#define GGML_F16_VEC_ZERO   GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1   GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD   GGML_F32Cx4_LOAD
+#define GGML_F16_VEC_STORE  GGML_F32Cx4_STORE
+#define GGML_F16_VEC_FMA    GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD    GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL    GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
+#endif
 
+#elif defined(__AVX__)
 
+#define GGML_SIMD
+
+// F32 AVX
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  8
+
+#define GGML_F32x8         __m256
+#define GGML_F32x8_ZERO    _mm256_setzero_ps()
+#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
+#define GGML_F32x8_LOAD    _mm256_loadu_ps
+#define GGML_F32x8_STORE   _mm256_storeu_ps
+#if defined(__FMA__)
+#define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
+#else
+#define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
+#endif
+#define GGML_F32x8_ADD _mm256_add_ps
+#define GGML_F32x8_MUL _mm256_mul_ps
+#define GGML_F32x8_REDUCE(res, x)                                 \
+{                                                                 \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) {                    \
+        x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]);                 \
+    }                                                             \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) {                    \
+        x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]);                 \
+    }                                                             \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) {                    \
+        x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]);                 \
+    }                                                             \
+    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
+                                 _mm256_extractf128_ps(x[0], 1)); \
+    const __m128 t1 = _mm_hadd_ps(t0, t0);                        \
+    res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1));                     \
+}
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x8
+#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
+
+// F16 AVX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  8
+
+// F16 arithmetic is not supported by AVX, so we use F32 instead
+// we take advantage of the _mm256_cvt intrinsics to convert F16 <-> F32
+
+#define GGML_F32Cx8             __m256
+#define GGML_F32Cx8_ZERO        _mm256_setzero_ps()
+#define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x)
+#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
+#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
+#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
+#define GGML_F32Cx8_ADD         _mm256_add_ps
+#define GGML_F32Cx8_MUL         _mm256_mul_ps
+#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
+
+#define GGML_F16_VEC        GGML_F32Cx8
+#define GGML_F16_VEC_ZERO   GGML_F32Cx8_ZERO
+#define GGML_F16_VEC_SET1   GGML_F32Cx8_SET1
+#define GGML_F16_VEC_LOAD   GGML_F32Cx8_LOAD
+#define GGML_F16_VEC_STORE  GGML_F32Cx8_STORE
+#define GGML_F16_VEC_FMA    GGML_F32Cx8_FMA
+#define GGML_F16_VEC_ADD    GGML_F32Cx8_ADD
+#define GGML_F16_VEC_MUL    GGML_F32Cx8_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
 
+#elif defined(__POWER9_VECTOR__)
 
+// TODO: uncomment this when it works
+//#define GGML_SIMD
+
+// F32 POWER9
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  8
+
+// TODO: not tested !!
+#define GGML_F32x4              __vector float
+#define GGML_F32x4_ZERO         (__vector float){0.0f, 0.0f, 0.0f, 0.0f}
+#define GGML_F32x4_SET1(x)      (__vector float){x, x, x, x}
+#define GGML_F32x4_LOAD         vec_vsx_ld
+#define GGML_F32x4_STORE        vec_vsx_st
+#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
+#define GGML_F32x4_ADD          vec_add
+#define GGML_F32x4_MUL          vec_mul
+#define GGML_F32x4_REDUCE(res, x)              \
+{                                              \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
+        x[2*i] = vec_add(x[2*i], x[2*i+1]);    \
+    }                                          \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
+        x[4*i] = vec_add(x[4*i], x[4*i+2]);    \
+    }                                          \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
+        x[8*i] = vec_add(x[8*i], x[8*i+4]);    \
+    }                                          \
+    res = vec_extract(x[0], 0) +               \
+          vec_extract(x[0], 1) +               \
+          vec_extract(x[0], 2) +               \
+          vec_extract(x[0], 3);                \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 POWER9
+// TODO: implement here
+// ...
 
+
#elif defined(__wasm_simd128__)
|
| 578 |
|
| 579 |
+
#define GGML_SIMD
|
| 580 |
+
|
| 581 |
+
// F32 WASM
|
| 582 |
+
|
| 583 |
+
#define GGML_F32_STEP 16
|
| 584 |
+
#define GGML_F32_EPR 4
|
| 585 |
+
|
| 586 |
+
#define GGML_F32x4 v128_t
|
| 587 |
+
#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
|
| 588 |
+
#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
|
| 589 |
+
#define GGML_F32x4_LOAD wasm_v128_load
|
| 590 |
+
#define GGML_F32x4_STORE wasm_v128_store
|
| 591 |
+
#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
|
| 592 |
+
#define GGML_F32x4_ADD wasm_f32x4_add
|
| 593 |
+
#define GGML_F32x4_MUL wasm_f32x4_mul
|
| 594 |
+
#define GGML_F32x4_REDUCE(res, x) \
|
| 595 |
+
{ \
|
| 596 |
+
for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
|
| 597 |
+
x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
|
| 598 |
+
} \
|
| 599 |
+
for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
|
| 600 |
+
x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
|
| 601 |
+
} \
|
| 602 |
+
for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
|
| 603 |
+
x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
|
| 604 |
+
} \
|
| 605 |
+
res = wasm_f32x4_extract_lane(x[0], 0) + \
|
| 606 |
+
wasm_f32x4_extract_lane(x[0], 1) + \
|
| 607 |
+
wasm_f32x4_extract_lane(x[0], 2) + \
|
| 608 |
+
wasm_f32x4_extract_lane(x[0], 3); \
|
| 609 |
+
}
|
| 610 |
+
|
| 611 |
+
#define GGML_F32_VEC GGML_F32x4
|
| 612 |
+
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
| 613 |
+
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
| 614 |
+
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
| 615 |
+
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
| 616 |
+
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
| 617 |
+
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
| 618 |
+
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
| 619 |
+
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
| 620 |
+
|
| 621 |
+
// F16 WASM
|
| 622 |
+
|
| 623 |
+
#define GGML_F16_STEP 16
|
| 624 |
+
#define GGML_F16_EPR 4
|
| 625 |
+
|
| 626 |
+
inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
|
| 627 |
+
float tmp[4];
|
| 628 |
+
|
| 629 |
+
tmp[0] = GGML_FP16_TO_FP32(p[0]);
|
| 630 |
+
tmp[1] = GGML_FP16_TO_FP32(p[1]);
|
| 631 |
+
tmp[2] = GGML_FP16_TO_FP32(p[2]);
|
| 632 |
+
tmp[3] = GGML_FP16_TO_FP32(p[3]);
|
| 633 |
+
|
| 634 |
+
return wasm_v128_load(tmp);
|
| 635 |
+
}
|
| 636 |
+
|
| 637 |
+
inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
| 638 |
+
float tmp[4];
|
| 639 |
+
|
| 640 |
+
wasm_v128_store(tmp, x);
|
| 641 |
+
|
| 642 |
+
p[0] = GGML_FP32_TO_FP16(tmp[0]);
|
| 643 |
+
p[1] = GGML_FP32_TO_FP16(tmp[1]);
|
| 644 |
+
p[2] = GGML_FP32_TO_FP16(tmp[2]);
|
| 645 |
+
p[3] = GGML_FP32_TO_FP16(tmp[3]);
|
| 646 |
+
}
|
| 647 |
+
|
| 648 |
+
#define GGML_F16x4 v128_t
|
| 649 |
+
#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
|
| 650 |
+
#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
|
| 651 |
+
#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
|
| 652 |
+
#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
|
| 653 |
+
#define GGML_F16x4_FMA GGML_F32x4_FMA
|
| 654 |
+
#define GGML_F16x4_ADD wasm_f32x4_add
|
| 655 |
+
#define GGML_F16x4_MUL wasm_f32x4_mul
|
| 656 |
+
#define GGML_F16x4_REDUCE(res, x) \
|
| 657 |
+
{ \
|
| 658 |
+
for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
|
| 659 |
+
x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
|
| 660 |
+
} \
|
| 661 |
+
for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
|
| 662 |
+
x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
|
| 663 |
+
} \
|
| 664 |
+
for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
|
| 665 |
+
x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
|
| 666 |
+
} \
|
| 667 |
+
res = wasm_f32x4_extract_lane(x[0], 0) + \
|
| 668 |
+
wasm_f32x4_extract_lane(x[0], 1) + \
|
| 669 |
+
wasm_f32x4_extract_lane(x[0], 2) + \
|
| 670 |
+
wasm_f32x4_extract_lane(x[0], 3); \
|
| 671 |
+
}
|
| 672 |
+
|
| 673 |
+
#define GGML_F16_VEC GGML_F16x4
|
| 674 |
+
#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
|
| 675 |
+
#define GGML_F16_VEC_SET1 GGML_F16x4_SET1
|
| 676 |
+
#define GGML_F16_VEC_LOAD GGML_F16x4_LOAD
|
| 677 |
+
#define GGML_F16_VEC_STORE GGML_F16x4_STORE
|
| 678 |
+
#define GGML_F16_VEC_FMA GGML_F16x4_FMA
|
| 679 |
+
#define GGML_F16_VEC_ADD GGML_F16x4_ADD
|
| 680 |
+
#define GGML_F16_VEC_MUL GGML_F16x4_MUL
|
| 681 |
+
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
|
| 682 |
|
| 683 |
+
#endif
|
|
|
|
|
|
|
|
|
|
| 684 |
|
| 685 |
+
// GGML_F32_ARR / GGML_F16_ARR
|
| 686 |
+
// number of registers to use per step
|
| 687 |
+
#ifdef GGML_SIMD
|
| 688 |
+
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
|
| 689 |
+
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
|
| 690 |
+
#endif
|
| 691 |
|
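With the values defined above, every backend currently works out to 4 registers per step: NEON F32 16/4, NEON F16 32/8 (or 16/4 through the F32Cx4 fallback), AVX 32/8 for both F32 and F16, and WASM 16/4 — so the GGML_F32_ARR/8 pass in the reduce macros never executes yet, but the pattern already covers wider configurations.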
+//
+// fundamental operations
+//
 
+inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
+inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
+inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
+inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
+inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
+inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
+inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
+inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
+inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 
+inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
+    ggml_float sumf = 0.0;
 
+#ifdef GGML_SIMD
+    const int np = (n & ~(GGML_F32_STEP - 1));
 
+    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
 
+    GGML_F32_VEC ax[GGML_F32_ARR];
+    GGML_F32_VEC ay[GGML_F32_ARR];
 
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
 
+            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
+        }
     }
 
+    // reduce sum0..sum3 to sum0
+    GGML_F32_VEC_REDUCE(sumf, sum);
 
     // leftovers
+    for (int i = np; i < n; ++i) {
         sumf += x[i]*y[i];
     }
 #else
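For reference, a minimal sketch of how the rewritten routine behaves. ggml_vec_dot_f32 is declared inline static, so this would have to live inside ggml.c itself; the helper name test_vec_dot_f32 is made up for illustration:

// hypothetical test helper, assuming it is pasted into ggml.c next to ggml_vec_dot_f32
static float test_vec_dot_f32(void) {
    float x[35], y[35], s = 0.0f;
    for (int i = 0; i < 35; ++i) { x[i] = 1.0f; y[i] = 2.0f; }

    // n = 35 exercises both the SIMD loop (np = 32 with the STEP values above) and the scalar leftovers
    ggml_vec_dot_f32(35, &s, x, y);
    return s; // 70.0f on every backend, GGML_SIMD or not
}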
 
 inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
     ggml_float sumf = 0.0;
 
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
 
+    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
 
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
 
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR);
 
+            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+        }
     }
 
     // reduce sum0..sum3 to sum0
+    GGML_F16_VEC_REDUCE(sumf, sum);
 
     // leftovers
+    for (int i = np; i < n; ++i) {
         sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
     }
 #elif defined(__POWER9_VECTOR__)
+    // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
+    // being able to test it. hoping someone with access to a POWER9 machine can help out here.
     const int n32 = (n & ~31);
 
     vector float sum0 = vec_splats (0.0f);
 
     for (int i = n32; i < n; ++i) {
         sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
     }
 #else
     for (int i = 0; i < n; ++i) {
         sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
| 838 |
}
|
| 839 |
|
| 840 |
inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
|
| 841 |
+
#if defined(GGML_SIMD)
|
| 842 |
+
const int np = (n & ~(GGML_F32_STEP - 1));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

+    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

+    GGML_F32_VEC ax[GGML_F32_ARR];
+    GGML_F32_VEC ay[GGML_F32_ARR];

+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);

+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+        }
     }

     // leftovers
+    for (int i = np; i < n; ++i) {
         y[i] += x[i]*v;
     }
 #else
    [...]
 }
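ggml_vec_mad_f32 (y := x*v + y) reuses the same skeleton, with a store in place of the running sum. As a rough illustration of what one register's worth of GGML_F32_VEC_LOAD/FMA/STORE lowers to on an AVX+FMA x86 build, here is a hedged, self-contained sketch written directly against the standard intrinsics; the real routine goes through the ggml macros and processes GGML_F32_ARR registers per step, not one.

// Hedged sketch (not ggml code): an FMA-based "y += x*v" using 256-bit AVX registers.
// Build with something like: cc -O2 -mavx -mfma axpy.c
#include <immintrin.h>
#include <stdio.h>

static void mad_f32_sketch(int n, float *y, const float *x, float v) {
    const int np = n & ~(8 - 1);              // 8 floats per 256-bit register
    const __m256 vx = _mm256_set1_ps(v);      // broadcast v (the GGML_F32_VEC_SET1 step)

    for (int i = 0; i < np; i += 8) {
        __m256 ax = _mm256_loadu_ps(x + i);   // GGML_F32_VEC_LOAD
        __m256 ay = _mm256_loadu_ps(y + i);
        ay = _mm256_fmadd_ps(ax, vx, ay);     // ax*vx + ay, one fused instruction
        _mm256_storeu_ps(y + i, ay);          // GGML_F32_VEC_STORE
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] += x[i]*v;
    }
}

int main(void) {
    float x[19], y[19];
    for (int i = 0; i < 19; ++i) { x[i] = (float) i; y[i] = 1.0f; }
    mad_f32_sketch(19, y, x, 0.5f);
    printf("%f %f\n", y[0], y[18]);           // expect 1.0 and 10.0
    return 0;
}

The operand order matches the code above: _mm256_fmadd_ps(ax, vx, ay) computes ax*vx + ay, which is what ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx) expresses.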

 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));

+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];

+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);

+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay[j]);
+        }
     }

     // leftovers
+    for (int i = np; i < n; ++i) {
         GGML_ASSERT(false);
         y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
     }
+#elif defined(__POWER9_VECTOR__)
+    // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
+    // being able to test it. hoping someone with access to a POWER9 machine can help out here.
     const int n32 = (n & ~31);
     for (int i = 0; i < n32; i += 32) {
+        // Use vec_xl, not vec_ld, because x is sometimes unaligned!
+        vector unsigned short x0 = vec_xl(i * 2 + 0, x);
+        vector unsigned short x1 = vec_xl(i * 2 + 16, x);
+        vector unsigned short x2 = vec_xl(i * 2 + 32, x);
+        vector unsigned short x3 = vec_xl(i * 2 + 48, x);

+        vector unsigned short y0 = vec_xl(i * 2 + 0, y);
+        vector unsigned short y1 = vec_xl(i * 2 + 16, y);
+        vector unsigned short y2 = vec_xl(i * 2 + 32, y);
+        vector unsigned short y3 = vec_xl(i * 2 + 48, y);

+        vector float v4 = vec_splats(v);

+        vector float fx0l = vec_extract_fp32_from_shortl(x0);
+        vector float fx0h = vec_extract_fp32_from_shorth(x0);
+        vector float fx1l = vec_extract_fp32_from_shortl(x1);
+        vector float fx1h = vec_extract_fp32_from_shorth(x1);
+        vector float fx2l = vec_extract_fp32_from_shortl(x2);
+        vector float fx2h = vec_extract_fp32_from_shorth(x2);
+        vector float fx3l = vec_extract_fp32_from_shortl(x3);
+        vector float fx3h = vec_extract_fp32_from_shorth(x3);

+        vector float fy0l = vec_extract_fp32_from_shortl(y0);
+        vector float fy0h = vec_extract_fp32_from_shorth(y0);
+        vector float fy1l = vec_extract_fp32_from_shortl(y1);
+        vector float fy1h = vec_extract_fp32_from_shorth(y1);
+        vector float fy2l = vec_extract_fp32_from_shortl(y2);
+        vector float fy2h = vec_extract_fp32_from_shorth(y2);
+        vector float fy3l = vec_extract_fp32_from_shortl(y3);
+        vector float fy3h = vec_extract_fp32_from_shorth(y3);

+        fy0l = vec_madd(fx0l, v4, fy0l);
+        fy0h = vec_madd(fx0h, v4, fy0h);
+        fy1l = vec_madd(fx1l, v4, fy1l);
+        fy1h = vec_madd(fx1h, v4, fy1h);
+        fy2l = vec_madd(fx2l, v4, fy2l);
+        fy2h = vec_madd(fx2h, v4, fy2h);
+        fy3l = vec_madd(fx3l, v4, fy3l);
+        fy3h = vec_madd(fx3h, v4, fy3h);

+        y0 = vec_pack_to_short_fp32(fy0h, fy0l);
+        y1 = vec_pack_to_short_fp32(fy1h, fy1l);
+        y2 = vec_pack_to_short_fp32(fy2h, fy2l);
+        y3 = vec_pack_to_short_fp32(fy3h, fy3l);

+        vec_xst(y0, i * 2 + 0, y);
+        vec_xst(y1, i * 2 + 16, y);
+        vec_xst(y2, i * 2 + 32, y);
+        vec_xst(y3, i * 2 + 48, y);
     }

     for (int i = n32; i < n; ++i) {
         y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
     }
 #else
    [...]

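The scalar leftover path of ggml_vec_mad_f16 converts each half-precision value to float, applies the multiply-add, and converts back. Purely as an illustration, on x86 with F16C those two conversions can be done with the scalar _cvtsh_ss/_cvtss_sh intrinsics, as sketched below; ggml itself goes through its GGML_FP16_TO_FP32/GGML_FP32_TO_FP16 macros, whose expansion is target-dependent, and ggml_fp16_t is a plain 16-bit integer only on non-ARM builds.

// Hedged sketch (not ggml code): the scalar fp16 multiply-add tail, with x86 F16C
// conversions standing in for GGML_FP16_TO_FP32 / GGML_FP32_TO_FP16.
// Build with something like: cc -O2 -mf16c mad_f16_tail.c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

typedef uint16_t fp16_t;   // raw 16-bit storage for the half-precision values

static void mad_f16_tail(int np, int n, fp16_t *y, const fp16_t *x, float v) {
    for (int i = np; i < n; ++i) {
        const float yf = _cvtsh_ss(y[i]);     // fp16 -> fp32
        const float xf = _cvtsh_ss(x[i]);
        y[i] = _cvtss_sh(yf + xf*v, _MM_FROUND_CUR_DIRECTION);   // fp32 -> fp16
    }
}

int main(void) {
    const fp16_t one = _cvtss_sh(1.0f, _MM_FROUND_CUR_DIRECTION);
    const fp16_t two = _cvtss_sh(2.0f, _MM_FROUND_CUR_DIRECTION);
    fp16_t x[3] = { two, two, two };
    fp16_t y[3] = { one, one, one };
    mad_f16_tail(0, 3, y, x, 0.5f);           // y[i] = 1 + 2*0.5 = 2
    printf("%f\n", _cvtsh_ss(y[0]));          // expect 2.0
    return 0;
}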
 //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
 inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));

+    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

+    GGML_F32_VEC ay[GGML_F32_ARR];

+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+        }
     }

     // leftovers
+    for (int i = np; i < n; ++i) {
         y[i] *= v;
     }
 #else
    [...]
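Every routine in this patch splits its range the same way: np = n & ~(GGML_F32_STEP - 1) (or the F16 equivalent) rounds n down to a multiple of the step size, which works because the step is a power of two; the vector loop covers [0, np) and the scalar "leftovers" loop finishes [np, n). A small standalone check of that arithmetic, with 32 assumed as the step:

// Standalone check of the rounding trick used by the GGML_SIMD loops above:
// n & ~(STEP - 1) rounds n down to a multiple of STEP when STEP is a power of two.
#include <stdio.h>

int main(void) {
    const int STEP = 32;   // assumed step size for the demonstration
    for (int n = 30; n <= 70; n += 20) {
        const int np = n & ~(STEP - 1);
        printf("n = %2d -> vector part [0, %2d), leftovers [%2d, %2d)\n", n, np, np, n);
    }
    return 0;
}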
 #endif
 }

+int ggml_cpu_has_fma(void) {
+#if defined(__FMA__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_neon(void) {
 #if defined(__ARM_NEON)
     return 1;
    [...]
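ggml_cpu_has_fma() follows the pattern of the other ggml_cpu_has_* helpers: it reports whether this translation unit was compiled with FMA code generation enabled (the compiler defines __FMA__ under flags such as -mfma or -march=native on x86), rather than probing the CPU at runtime. A minimal standalone version of the same check:

// Minimal sketch of the compile-time check behind ggml_cpu_has_fma():
// __FMA__ is defined by the compiler when FMA code generation is enabled.
#include <stdio.h>

int main(void) {
#if defined(__FMA__)
    printf("built with FMA: yes\n");
#else
    printf("built with FMA: no\n");
#endif
    return 0;
}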
ggml.h
CHANGED
@@ -724,6 +724,7 @@ enum ggml_opt_result ggml_opt(
 int ggml_cpu_has_avx(void);
 int ggml_cpu_has_avx2(void);
 int ggml_cpu_has_avx512(void);
+int ggml_cpu_has_fma(void);
 int ggml_cpu_has_neon(void);
 int ggml_cpu_has_arm_fma(void);
 int ggml_cpu_has_f16c(void);
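With the declaration added to ggml.h, callers can query the new flag just like the existing ones. A hedged usage sketch, assuming the program is compiled and linked against ggml (for example cc main.c ggml.c -lm -lpthread):

// Usage sketch: querying the ggml compile-time feature flags, including the new FMA one.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    printf("AVX2 = %d, FMA = %d, NEON = %d, F16C = %d\n",
           ggml_cpu_has_avx2(),
           ggml_cpu_has_fma(),
           ggml_cpu_has_neon(),
           ggml_cpu_has_f16c());
    return 0;
}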
whisper.cpp
CHANGED
@@ -2555,6 +2555,7 @@ const char * whisper_print_system_info(void) {
     s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
     s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
     s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
     s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
     s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
     s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
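On the whisper.cpp side the only change is that the string returned by whisper_print_system_info() now carries an FMA field next to the existing flags. A hedged sketch of printing it, assuming the program is built against the whisper library and whisper.h is on the include path:

// Usage sketch: the system-info string now reports the FMA flag as well,
// e.g. "AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ..." (illustrative output).
#include <stdio.h>
#include "whisper.h"

int main(void) {
    printf("%s\n", whisper_print_system_info());
    return 0;
}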