Thomas Fitzsimmons committed
Commit 0d5a830 · 1 Parent(s): f39e0be

ggml : add f16 acceleration for POWER9 ppc64le

Files changed (2)
  1. Makefile +6 -0
  2. ggml.c +114 -0
Makefile CHANGED
@@ -105,6 +105,12 @@ endif
  ifeq ($(UNAME_M),amd64)
      CFLAGS += -mavx -mavx2 -mfma -mf16c
  endif
+ ifeq ($(UNAME_M),ppc64le)
+     POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
+     ifneq (,$(findstring POWER9,$(POWER9_M)))
+         CFLAGS += -mpower9-vector
+     endif
+ endif
  ifndef WHISPER_NO_ACCELERATE
      # Mac M1 - include Accelerate framework
      ifeq ($(UNAME_S),Darwin)
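
For context (not part of this commit): -mpower9-vector is the GCC option that defines the __POWER9_VECTOR__ macro, which is what the ggml.c changes below test before including <altivec.h> and using the POWER9 intrinsics. A minimal probe like the following, assuming GCC on a POWER9 ppc64le host, can confirm the flag is being picked up; the file name probe.c is purely illustrative.

    /* probe.c -- illustrative sketch, assuming gcc -mpower9-vector on ppc64le.
     * Compilation fails if the POWER9 vector support enabled by the Makefile is absent. */
    #ifndef __POWER9_VECTOR__
    #error "__POWER9_VECTOR__ not defined; build with -mpower9-vector (or -mcpu=power9)"
    #endif

    #include <altivec.h>

    int main(void) {
        /* Touch one of the intrinsics used by the patch to force code generation. */
        vector float z = vec_splats(0.0f);
        return (int) vec_extract(z, 0);
    }
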
ggml.c CHANGED
@@ -138,8 +138,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
  #ifdef __wasm_simd128__
  #include <wasm_simd128.h>
  #else
+ #ifdef __POWER9_VECTOR__
+ #include <altivec.h>
+ #undef bool
+ #define bool _Bool
+ #else
  #include <immintrin.h>
  #endif
+ #endif

  #ifdef __F16C__
  float ggml_fp16_to_fp32(ggml_fp16_t h) {
@@ -702,6 +708,57 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
          //GGML_ASSERT(false);
          sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
      }
+ #elif defined(__POWER9_VECTOR__)
+     const int n32 = (n & ~31);
+
+     vector float sum0 = vec_splats (0.0f);
+
+     for (int i = 0; i < n32; i += 32) {
+         // Use vec_xl, not vec_ld, because x is sometimes unaligned.
+         vector unsigned short x0 = vec_xl(i * 2 + 0, x);
+         vector unsigned short x1 = vec_xl(i * 2 + 16, x);
+         vector unsigned short x2 = vec_xl(i * 2 + 32, x);
+         vector unsigned short x3 = vec_xl(i * 2 + 48, x);
+
+         vector unsigned short y0 = vec_xl(i * 2 + 0, y);
+         vector unsigned short y1 = vec_xl(i * 2 + 16, y);
+         vector unsigned short y2 = vec_xl(i * 2 + 32, y);
+         vector unsigned short y3 = vec_xl(i * 2 + 48, y);
+
+         vector float fx0l = vec_extract_fp32_from_shortl(x0);
+         vector float fx0h = vec_extract_fp32_from_shorth(x0);
+         vector float fx1l = vec_extract_fp32_from_shortl(x1);
+         vector float fx1h = vec_extract_fp32_from_shorth(x1);
+         vector float fx2l = vec_extract_fp32_from_shortl(x2);
+         vector float fx2h = vec_extract_fp32_from_shorth(x2);
+         vector float fx3l = vec_extract_fp32_from_shortl(x3);
+         vector float fx3h = vec_extract_fp32_from_shorth(x3);
+
+         vector float fy0l = vec_extract_fp32_from_shortl(y0);
+         vector float fy0h = vec_extract_fp32_from_shorth(y0);
+         vector float fy1l = vec_extract_fp32_from_shortl(y1);
+         vector float fy1h = vec_extract_fp32_from_shorth(y1);
+         vector float fy2l = vec_extract_fp32_from_shortl(y2);
+         vector float fy2h = vec_extract_fp32_from_shorth(y2);
+         vector float fy3l = vec_extract_fp32_from_shortl(y3);
+         vector float fy3h = vec_extract_fp32_from_shorth(y3);
+
+         sum0 = vec_add(sum0, vec_mul(fx0l, fy0l));
+         sum0 = vec_add(sum0, vec_mul(fx0h, fy0h));
+         sum0 = vec_add(sum0, vec_mul(fx1l, fy1l));
+         sum0 = vec_add(sum0, vec_mul(fx1h, fy1h));
+         sum0 = vec_add(sum0, vec_mul(fx2l, fy2l));
+         sum0 = vec_add(sum0, vec_mul(fx2h, fy2h));
+         sum0 = vec_add(sum0, vec_mul(fx3l, fy3l));
+         sum0 = vec_add(sum0, vec_mul(fx3h, fy3h));
+     }
+
+     sumf = vec_extract(sum0, 0) + vec_extract(sum0, 1)
+          + vec_extract(sum0, 2) + vec_extract(sum0, 3);
+
+     for (int i = n32; i < n; ++i) {
+         sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
+     }
  #elif defined(__wasm_simd128__)
      // WASM 128-bit
      const int n16 = (n & ~15);
@@ -1063,6 +1120,63 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
          GGML_ASSERT(false);
          y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
      }
+ #elif defined(__POWER9_VECTOR__)
+     const int n32 = (n & ~31);
+     for (int i = 0; i < n32; i += 32) {
+         // Use vec_xl, not vec_ld, because x is sometimes unaligned!
+         vector unsigned short x0 = vec_xl(i * 2 + 0, x);
+         vector unsigned short x1 = vec_xl(i * 2 + 16, x);
+         vector unsigned short x2 = vec_xl(i * 2 + 32, x);
+         vector unsigned short x3 = vec_xl(i * 2 + 48, x);
+
+         vector unsigned short y0 = vec_xl(i * 2 + 0, y);
+         vector unsigned short y1 = vec_xl(i * 2 + 16, y);
+         vector unsigned short y2 = vec_xl(i * 2 + 32, y);
+         vector unsigned short y3 = vec_xl(i * 2 + 48, y);
+
+         vector float v4 = vec_splats(v);
+
+         vector float fx0l = vec_extract_fp32_from_shortl(x0);
+         vector float fx0h = vec_extract_fp32_from_shorth(x0);
+         vector float fx1l = vec_extract_fp32_from_shortl(x1);
+         vector float fx1h = vec_extract_fp32_from_shorth(x1);
+         vector float fx2l = vec_extract_fp32_from_shortl(x2);
+         vector float fx2h = vec_extract_fp32_from_shorth(x2);
+         vector float fx3l = vec_extract_fp32_from_shortl(x3);
+         vector float fx3h = vec_extract_fp32_from_shorth(x3);
+
+         vector float fy0l = vec_extract_fp32_from_shortl(y0);
+         vector float fy0h = vec_extract_fp32_from_shorth(y0);
+         vector float fy1l = vec_extract_fp32_from_shortl(y1);
+         vector float fy1h = vec_extract_fp32_from_shorth(y1);
+         vector float fy2l = vec_extract_fp32_from_shortl(y2);
+         vector float fy2h = vec_extract_fp32_from_shorth(y2);
+         vector float fy3l = vec_extract_fp32_from_shortl(y3);
+         vector float fy3h = vec_extract_fp32_from_shorth(y3);
+
+         fy0l = vec_madd(fx0l, v4, fy0l);
+         fy0h = vec_madd(fx0h, v4, fy0h);
+         fy1l = vec_madd(fx1l, v4, fy1l);
+         fy1h = vec_madd(fx1h, v4, fy1h);
+         fy2l = vec_madd(fx2l, v4, fy2l);
+         fy2h = vec_madd(fx2h, v4, fy2h);
+         fy3l = vec_madd(fx3l, v4, fy3l);
+         fy3h = vec_madd(fx3h, v4, fy3h);
+
+         y0 = vec_pack_to_short_fp32(fy0h, fy0l);
+         y1 = vec_pack_to_short_fp32(fy1h, fy1l);
+         y2 = vec_pack_to_short_fp32(fy2h, fy2l);
+         y3 = vec_pack_to_short_fp32(fy3h, fy3l);
+
+         vec_xst(y0, i * 2 + 0, y);
+         vec_xst(y1, i * 2 + 16, y);
+         vec_xst(y2, i * 2 + 32, y);
+         vec_xst(y3, i * 2 + 48, y);
+     }
+
+     for (int i = n32; i < n; ++i) {
+         y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+     }
  #elif defined(__wasm_simd128__)
      // WASM SIMD 128-bit
      const int n16 = (n & ~15);
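
As a reading aid (a hedged, standalone sketch, not part of the patch): the core of both new POWER9 branches is the intrinsic pair vec_extract_fp32_from_shortl / vec_extract_fp32_from_shorth, which each widen one half of a vector of eight f16 halfwords into four f32 values. The toy program below, assuming GCC with -mpower9-vector and f16 values stored as raw uint16_t (as ggml_fp16_t is), applies the same load/convert pattern to an 8-element dot product; dot_f16_8 is an illustrative name, not a ggml function.

    #include <altivec.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Dot product of eight f16 values, accumulated in f32, using the same
     * load/convert pattern as the ggml_vec_dot_f16 branch above. */
    static float dot_f16_8(const uint16_t *x, const uint16_t *y) {
        /* vec_xl tolerates unaligned pointers, which is why the patch avoids vec_ld. */
        vector unsigned short vx = vec_xl(0, x);
        vector unsigned short vy = vec_xl(0, y);

        /* Each intrinsic widens one half of the eight halfwords to four floats. */
        vector float xl = vec_extract_fp32_from_shortl(vx);
        vector float xh = vec_extract_fp32_from_shorth(vx);
        vector float yl = vec_extract_fp32_from_shortl(vy);
        vector float yh = vec_extract_fp32_from_shorth(vy);

        /* Lane-wise sum = xl*yl + xh*yh; the horizontal sum below yields the dot product. */
        vector float sum = vec_madd(xl, yl, vec_mul(xh, yh));

        return vec_extract(sum, 0) + vec_extract(sum, 1)
             + vec_extract(sum, 2) + vec_extract(sum, 3);
    }

    int main(void) {
        /* 1.0 .. 8.0 and eight copies of 1.0, written as IEEE f16 bit patterns. */
        const uint16_t x[8] = { 0x3C00, 0x4000, 0x4200, 0x4400, 0x4500, 0x4600, 0x4700, 0x4800 };
        const uint16_t y[8] = { 0x3C00, 0x3C00, 0x3C00, 0x3C00, 0x3C00, 0x3C00, 0x3C00, 0x3C00 };
        printf("%f\n", dot_f16_8(x, y));   /* expected: 36.000000 */
        return 0;
    }

Because the accumulation is a plain sum over all lanes, the result does not depend on which half of the halfword vector each intrinsic maps to, mirroring how the patch accumulates into sum0 before the final horizontal reduction.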