Commit f92a260 by Thomas Fitzsimmons
Parent(s): 821a538
ggml : improve f16 acceleration for POWER9 ppc64le
ggml.c CHANGED
@@ -781,18 +781,25 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
     const int n32 = (n & ~31);
 
     vector float sum0 = vec_splats (0.0f);
-
-    for (int i = 0; i < n32; i += 32) {
+    vector float sum1 = vec_splats (0.0f);
+    vector float sum2 = vec_splats (0.0f);
+    vector float sum3 = vec_splats (0.0f);
+    vector float sum4 = vec_splats (0.0f);
+    vector float sum5 = vec_splats (0.0f);
+    vector float sum6 = vec_splats (0.0f);
+    vector float sum7 = vec_splats (0.0f);
+
+    for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
         // Use vec_xl, not vec_ld, because x is sometimes unaligned.
-        vector unsigned short x0 = vec_xl(i*2 + 0, x);
-        vector unsigned short x1 = vec_xl(i*2 + 16, x);
-        vector unsigned short x2 = vec_xl(i*2 + 32, x);
-        vector unsigned short x3 = vec_xl(i*2 + 48, x);
+        vector unsigned short x0 = vec_xl(j + 0, x);
+        vector unsigned short x1 = vec_xl(j + 16, x);
+        vector unsigned short x2 = vec_xl(j + 32, x);
+        vector unsigned short x3 = vec_xl(j + 48, x);
 
-        vector unsigned short y0 = vec_xl(i*2 + 0, y);
-        vector unsigned short y1 = vec_xl(i*2 + 16, y);
-        vector unsigned short y2 = vec_xl(i*2 + 32, y);
-        vector unsigned short y3 = vec_xl(i*2 + 48, y);
+        vector unsigned short y0 = vec_ld(j + 0, y);
+        vector unsigned short y1 = vec_ld(j + 16, y);
+        vector unsigned short y2 = vec_ld(j + 32, y);
+        vector unsigned short y3 = vec_ld(j + 48, y);
 
         vector float fx0l = vec_extract_fp32_from_shortl(x0);
         vector float fx0h = vec_extract_fp32_from_shorth(x0);
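A note on the two load intrinsics in this hunk: on Power, vec_ld truncates the effective address to a 16-byte boundary, so it silently reads the wrong bytes through an unaligned pointer, while vec_xl performs a true unaligned load. The diff keeps vec_xl for x but switches the y loads to vec_ld, which is only safe if y is assumed 16-byte aligned here. A minimal illustration of the difference (a sketch with made-up buffer names, assuming a VSX-enabled compiler such as gcc -mcpu=power9):

    #include <altivec.h>

    static unsigned short buf[16] __attribute__((aligned(16)));

    void load_demo(void) {
        unsigned short *p = buf + 1;               // NOT 16-byte aligned
        vector unsigned short a = vec_ld(0, buf);  // fine: buf is aligned
        vector unsigned short b = vec_xl(0, p);    // fine: vec_xl allows misalignment
        // vec_ld(0, p) would round the address down and load buf[0..7]
        // instead of buf[1..8].
        (void)a; (void)b;
    }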
@@ -812,16 +819,26 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
         vector float fy3l = vec_extract_fp32_from_shortl(y3);
         vector float fy3h = vec_extract_fp32_from_shorth(y3);
 
-        sum0 = vec_madd(fx0l, fy0l, sum0);
-        sum0 = vec_madd(fx0h, fy0h, sum0);
-        sum0 = vec_madd(fx1l, fy1l, sum0);
-        sum0 = vec_madd(fx1h, fy1h, sum0);
-        sum0 = vec_madd(fx2l, fy2l, sum0);
-        sum0 = vec_madd(fx2h, fy2h, sum0);
-        sum0 = vec_madd(fx3l, fy3l, sum0);
-        sum0 = vec_madd(fx3h, fy3h, sum0);
+        sum0 = vec_madd(fx0l, fy0l, sum0);
+        sum1 = vec_madd(fx0h, fy0h, sum1);
+        sum2 = vec_madd(fx1l, fy1l, sum2);
+        sum3 = vec_madd(fx1h, fy1h, sum3);
+        sum4 = vec_madd(fx2l, fy2l, sum4);
+        sum5 = vec_madd(fx2h, fy2h, sum5);
+        sum6 = vec_madd(fx3l, fy3l, sum6);
+        sum7 = vec_madd(fx3h, fy3h, sum7);
     }
 
+    sum0 = vec_add(sum0, sum1);
+    sum2 = vec_add(sum2, sum3);
+    sum4 = vec_add(sum4, sum5);
+    sum6 = vec_add(sum6, sum7);
+
+    sum0 = vec_add(sum0, sum2);
+    sum4 = vec_add(sum4, sum6);
+
+    sum0 = vec_add(sum0, sum4);
+
     sumf = vec_extract(sum0, 0) + vec_extract(sum0, 1)
          + vec_extract(sum0, 2) + vec_extract(sum0, 3);
 
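The substance of the change to ggml_vec_dot_f16 is in the two hunks above: instead of chaining every vec_madd through a single sum0, which serializes the loop on the multiply-add latency, the eight products go into eight independent accumulators that the core can keep in flight at once, and a short vec_add reduction tree folds them after the loop. The same idea in portable scalar C (an illustrative sketch, not code from the commit):

    #include <stddef.h>

    float dot_unrolled(const float *x, const float *y, size_t n) {
        // Four independent partial sums break the dependency chain
        // between consecutive multiply-adds.
        float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
        size_t n4 = n & ~(size_t)3;
        for (size_t i = 0; i < n4; i += 4) {
            s0 += x[i + 0]*y[i + 0];
            s1 += x[i + 1]*y[i + 1];
            s2 += x[i + 2]*y[i + 2];
            s3 += x[i + 3]*y[i + 3];
        }
        // Fold the partials pairwise, like the vec_add tree above.
        float s = (s0 + s1) + (s2 + s3);
        // Scalar tail for the n & 3 leftover elements.
        for (size_t i = n4; i < n; ++i) {
            s += x[i]*y[i];
        }
        return s;
    }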
@@ -896,17 +913,17 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
     // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
     // being able to test it. hoping someone with access to a POWER9 machine can help out here.
     const int n32 = (n & ~31);
-    for (int i = 0; i < n32; i += 32) {
+    for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
         // Use vec_xl, not vec_ld, because x is sometimes unaligned!
-        vector unsigned short x0 = vec_xl(i*2 + 0, x);
-        vector unsigned short x1 = vec_xl(i*2 + 16, x);
-        vector unsigned short x2 = vec_xl(i*2 + 32, x);
-        vector unsigned short x3 = vec_xl(i*2 + 48, x);
+        vector unsigned short x0 = vec_xl(j + 0, x);
+        vector unsigned short x1 = vec_xl(j + 16, x);
+        vector unsigned short x2 = vec_xl(j + 32, x);
+        vector unsigned short x3 = vec_xl(j + 48, x);
 
-        vector unsigned short y0 = vec_xl(i*2 + 0, y);
-        vector unsigned short y1 = vec_xl(i*2 + 16, y);
-        vector unsigned short y2 = vec_xl(i*2 + 32, y);
-        vector unsigned short y3 = vec_xl(i*2 + 48, y);
+        vector unsigned short y0 = vec_xl(j + 0, y);
+        vector unsigned short y1 = vec_xl(j + 16, y);
+        vector unsigned short y2 = vec_xl(j + 32, y);
+        vector unsigned short y3 = vec_xl(j + 48, y);
 
         vector float v4 = vec_splats(v);
 
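Both functions widen fp16 data the same way: vec_extract_fp32_from_shortl and vec_extract_fp32_from_shorth convert the low and high four 16-bit halves of one vector into two vectors of four floats, and vec_pack_to_short_fp32 (seen in the next hunk) is the inverse, taking the high part first. A round-trip sketch under the same -mcpu=power9 assumption:

    #include <altivec.h>

    // Widen 8 fp16 values to 2 x 4 floats, then narrow them back;
    // h should come out unchanged (modulo NaN payload details).
    vector unsigned short fp16_roundtrip(vector unsigned short h) {
        vector float lo = vec_extract_fp32_from_shortl(h);
        vector float hi = vec_extract_fp32_from_shorth(h);
        return vec_pack_to_short_fp32(hi, lo);
    }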
@@ -942,10 +959,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
         y2 = vec_pack_to_short_fp32(fy2h, fy2l);
         y3 = vec_pack_to_short_fp32(fy3h, fy3l);
 
-        vec_xst(y0, i*2 + 0, y);
-        vec_xst(y1, i*2 + 16, y);
-        vec_xst(y2, i*2 + 32, y);
-        vec_xst(y3, i*2 + 48, y);
+        vec_xst(y0, j + 0, y);
+        vec_xst(y1, j + 16, y);
+        vec_xst(y2, j + 32, y);
+        vec_xst(y3, j + 48, y);
     }
 
     for (int i = n32; i < n; ++i) {
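After the vector body, the tail loop visible at the end of this hunk handles the 0 to 31 elements left over when n is not a multiple of 32. Its job, stated in scalar form with ggml.c's own conversion macros (a sketch of the surrounding context, not lines changed by this commit):

    for (int i = n32; i < n; ++i) {
        // y[i] += x[i]*v, computed in fp32 and narrowed back to fp16.
        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
    }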
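Verifying the change requires POWER9 hardware or an emulator, as the TODO in ggml_vec_mad_f16 notes. With GCC, the POWER9 vector code path and the intrinsics above are enabled by a target flag along the lines of (an assumed invocation, not part of the commit):

    gcc -O3 -mcpu=power9 -c ggml.c -o ggml.o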