Thomas Fitzsimmons committed on
Commit
f92a260
·
1 Parent(s): 821a538

ggml : improve f16 acceleration for POWER9 ppc64le

Browse files
Files changed (1) hide show
  1. ggml.c +48 -31
ggml.c CHANGED
@@ -781,18 +781,25 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
781
  const int n32 = (n & ~31);
782
 
783
  vector float sum0 = vec_splats (0.0f);
784
-
785
- for (int i = 0; i < n32; i += 32) {
 
 
 
 
 
 
 
786
  // Use vec_xl, not vec_ld, because x is sometimes unaligned.
787
- vector unsigned short x0 = vec_xl(i * 2 + 0, x);
788
- vector unsigned short x1 = vec_xl(i * 2 + 16, x);
789
- vector unsigned short x2 = vec_xl(i * 2 + 32, x);
790
- vector unsigned short x3 = vec_xl(i * 2 + 48, x);
791
 
792
- vector unsigned short y0 = vec_xl(i * 2 + 0, y);
793
- vector unsigned short y1 = vec_xl(i * 2 + 16, y);
794
- vector unsigned short y2 = vec_xl(i * 2 + 32, y);
795
- vector unsigned short y3 = vec_xl(i * 2 + 48, y);
796
 
797
  vector float fx0l = vec_extract_fp32_from_shortl(x0);
798
  vector float fx0h = vec_extract_fp32_from_shorth(x0);
@@ -812,16 +819,26 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
812
  vector float fy3l = vec_extract_fp32_from_shortl(y3);
813
  vector float fy3h = vec_extract_fp32_from_shorth(y3);
814
 
815
- sum0 = vec_add(sum0, vec_mul(fx0l, fy0l));
816
- sum0 = vec_add(sum0, vec_mul(fx0h, fy0h));
817
- sum0 = vec_add(sum0, vec_mul(fx1l, fy1l));
818
- sum0 = vec_add(sum0, vec_mul(fx1h, fy1h));
819
- sum0 = vec_add(sum0, vec_mul(fx2l, fy2l));
820
- sum0 = vec_add(sum0, vec_mul(fx2h, fy2h));
821
- sum0 = vec_add(sum0, vec_mul(fx3l, fy3l));
822
- sum0 = vec_add(sum0, vec_mul(fx3h, fy3h));
823
  }
824
 
 
 
 
 
 
 
 
 
 
 
825
  sumf = vec_extract(sum0, 0) + vec_extract(sum0, 1)
826
  + vec_extract(sum0, 2) + vec_extract(sum0, 3);
827
 
@@ -896,17 +913,17 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
896
  // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
897
  // being able to test it. hoping someone with access to a POWER9 machine can help out here.
898
  const int n32 = (n & ~31);
899
- for (int i = 0; i < n32; i += 32) {
900
  // Use vec_xl, not vec_ld, because x is sometimes unaligned!
901
- vector unsigned short x0 = vec_xl(i * 2 + 0, x);
902
- vector unsigned short x1 = vec_xl(i * 2 + 16, x);
903
- vector unsigned short x2 = vec_xl(i * 2 + 32, x);
904
- vector unsigned short x3 = vec_xl(i * 2 + 48, x);
905
 
906
- vector unsigned short y0 = vec_xl(i * 2 + 0, y);
907
- vector unsigned short y1 = vec_xl(i * 2 + 16, y);
908
- vector unsigned short y2 = vec_xl(i * 2 + 32, y);
909
- vector unsigned short y3 = vec_xl(i * 2 + 48, y);
910
 
911
  vector float v4 = vec_splats(v);
912
 
@@ -942,10 +959,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
942
  y2 = vec_pack_to_short_fp32(fy2h, fy2l);
943
  y3 = vec_pack_to_short_fp32(fy3h, fy3l);
944
 
945
- vec_xst(y0, i * 2 + 0, y);
946
- vec_xst(y1, i * 2 + 16, y);
947
- vec_xst(y2, i * 2 + 32, y);
948
- vec_xst(y3, i * 2 + 48, y);
949
  }
950
 
951
  for (int i = n32; i < n; ++i) {
 
781
  const int n32 = (n & ~31);
782
 
783
  vector float sum0 = vec_splats (0.0f);
784
+ vector float sum1 = vec_splats (0.0f);
785
+ vector float sum2 = vec_splats (0.0f);
786
+ vector float sum3 = vec_splats (0.0f);
787
+ vector float sum4 = vec_splats (0.0f);
788
+ vector float sum5 = vec_splats (0.0f);
789
+ vector float sum6 = vec_splats (0.0f);
790
+ vector float sum7 = vec_splats (0.0f);
791
+
792
+ for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
793
  // Use vec_xl, not vec_ld, because x is sometimes unaligned.
794
+ vector unsigned short x0 = vec_xl(j + 0, x);
795
+ vector unsigned short x1 = vec_xl(j + 16, x);
796
+ vector unsigned short x2 = vec_xl(j + 32, x);
797
+ vector unsigned short x3 = vec_xl(j + 48, x);
798
 
799
+ vector unsigned short y0 = vec_ld(j + 0, y);
800
+ vector unsigned short y1 = vec_ld(j + 16, y);
801
+ vector unsigned short y2 = vec_ld(j + 32, y);
802
+ vector unsigned short y3 = vec_ld(j + 48, y);
803
 
804
  vector float fx0l = vec_extract_fp32_from_shortl(x0);
805
  vector float fx0h = vec_extract_fp32_from_shorth(x0);
 
819
  vector float fy3l = vec_extract_fp32_from_shortl(y3);
820
  vector float fy3h = vec_extract_fp32_from_shorth(y3);
821
 
822
+ sum0 = vec_madd(fx0l, fy0l, sum0);
823
+ sum1 = vec_madd(fx0h, fy0h, sum1);
824
+ sum2 = vec_madd(fx1l, fy1l, sum2);
825
+ sum3 = vec_madd(fx1h, fy1h, sum3);
826
+ sum4 = vec_madd(fx2l, fy2l, sum4);
827
+ sum5 = vec_madd(fx2h, fy2h, sum5);
828
+ sum6 = vec_madd(fx3l, fy3l, sum6);
829
+ sum7 = vec_madd(fx3h, fy3h, sum7);
830
  }
831
 
832
+ sum0 = vec_add(sum0, sum1);
833
+ sum2 = vec_add(sum2, sum3);
834
+ sum4 = vec_add(sum4, sum5);
835
+ sum6 = vec_add(sum6, sum7);
836
+
837
+ sum0 = vec_add(sum0, sum2);
838
+ sum4 = vec_add(sum4, sum6);
839
+
840
+ sum0 = vec_add(sum0, sum4);
841
+
842
  sumf = vec_extract(sum0, 0) + vec_extract(sum0, 1)
843
  + vec_extract(sum0, 2) + vec_extract(sum0, 3);
844
 
 
913
  // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
914
  // being able to test it. hoping someone with access to a POWER9 machine can help out here.
915
  const int n32 = (n & ~31);
916
+ for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
917
  // Use vec_xl, not vec_ld, because x is sometimes unaligned!
918
+ vector unsigned short x0 = vec_xl(j + 0, x);
919
+ vector unsigned short x1 = vec_xl(j + 16, x);
920
+ vector unsigned short x2 = vec_xl(j + 32, x);
921
+ vector unsigned short x3 = vec_xl(j + 48, x);
922
 
923
+ vector unsigned short y0 = vec_xl(j + 0, y);
924
+ vector unsigned short y1 = vec_xl(j + 16, y);
925
+ vector unsigned short y2 = vec_xl(j + 32, y);
926
+ vector unsigned short y3 = vec_xl(j + 48, y);
927
 
928
  vector float v4 = vec_splats(v);
929
 
 
959
  y2 = vec_pack_to_short_fp32(fy2h, fy2l);
960
  y3 = vec_pack_to_short_fp32(fy3h, fy3l);
961
 
962
+ vec_xst(y0, j + 0, y);
963
+ vec_xst(y1, j + 16, y);
964
+ vec_xst(y2, j + 32, y);
965
+ vec_xst(y3, j + 48, y);
966
  }
967
 
968
  for (int i = n32; i < n; ++i) {