ggerganov committed on
Commit
87ee234
·
1 Parent(s): f50d3b3

ggml : fix 32-bit ARM build + quantization

Browse files
Files changed (2) hide show
  1. Makefile +8 -5
  2. ggml.c +64 -8
Makefile CHANGED
@@ -177,7 +177,7 @@ ifdef WHISPER_GPROF
177
  endif
178
 
179
  ifneq ($(filter aarch64%,$(UNAME_M)),)
180
- CFLAGS += -mcpu=native
181
  CXXFLAGS += -mcpu=native
182
  endif
183
 
@@ -188,15 +188,18 @@ endif
188
 
189
  ifneq ($(filter armv7%,$(UNAME_M)),)
190
  # 32-bit ARM, for example on Armbian or possibly raspbian
191
- CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
 
192
 
193
- # 64-bit ARM, use these (TODO: auto-detect 64-bit)
194
- # CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
 
195
  endif
196
 
197
  ifneq ($(filter armv8%,$(UNAME_M)),)
198
  # Raspberry Pi 4
199
- CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 
200
  endif
201
 
202
  #
 
177
  endif
178
 
179
  ifneq ($(filter aarch64%,$(UNAME_M)),)
180
+ CFLAGS += -mcpu=native
181
  CXXFLAGS += -mcpu=native
182
  endif
183
 
 
188
 
189
  ifneq ($(filter armv7%,$(UNAME_M)),)
190
  # 32-bit ARM, for example on Armbian or possibly raspbian
191
+ #CFLAGS += -mfpu=neon -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
192
+ #CXXFLAGS += -mfpu=neon -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
193
 
194
+ # 64-bit ARM on 32-bit OS, use these (TODO: auto-detect 64-bit)
195
+ CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
196
+ CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
197
  endif
198
 
199
  ifneq ($(filter armv8%,$(UNAME_M)),)
200
  # Raspberry Pi 4
201
+ CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
202
+ CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
203
  endif
204
 
205
  #
ggml.c CHANGED
@@ -671,35 +671,91 @@ float vmaxvq_f32(float32x4_t v) {
671
  }
672
 
673
  int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
674
- return vget_low_s8(vcombine_s8(a, b));
 
 
 
 
 
 
 
675
  }
676
 
677
  int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
678
- return vget_high_s8(vcombine_s8(a, b));
 
 
 
 
 
 
 
679
  }
680
 
681
  uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
682
- return vget_low_u8(vcombine_u8(a, b));
 
 
 
 
 
 
 
683
  }
684
 
685
  uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
686
- return vget_high_u8(vcombine_u8(a, b));
 
 
 
 
 
 
 
687
  }
688
 
689
  int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
690
- return vcombine_s8(vget_low_s8(a), vget_low_s8(b));
 
 
 
 
 
 
 
691
  }
692
 
693
  int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
694
- return vcombine_s8(vget_high_s8(a), vget_high_s8(b));
 
 
 
 
 
 
 
695
  }
696
 
697
  uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
698
- return vcombine_u8(vget_low_u8(a), vget_low_u8(b));
 
 
 
 
 
 
 
699
  }
700
 
701
  uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
702
- return vcombine_u8(vget_high_u8(a), vget_high_u8(b));
 
 
 
 
 
 
 
703
  }
704
 
705
  int32x4_t vcvtnq_s32_f32(float32x4_t v) {
 
671
  }
672
 
673
// Interleave the low halves of a and b: { a0,b0, a1,b1, a2,b2, a3,b3 }.
// Fallback for the vzip1_s8 intrinsic (native on AArch64), presumably for
// 32-bit ARM toolchains that lack it — relies on GCC/Clang vector subscripting.
int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
    int8x8_t res;

    for (int i = 0; i < 4; ++i) {
        res[2*i + 0] = a[i];
        res[2*i + 1] = b[i];
    }

    return res;
}
683
 
684
// Interleave the high halves of a and b: { a4,b4, a5,b5, a6,b6, a7,b7 }.
// Fallback for the vzip2_s8 intrinsic (native on AArch64), presumably for
// 32-bit ARM toolchains that lack it — relies on GCC/Clang vector subscripting.
int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
    int8x8_t res;

    for (int i = 0; i < 4; ++i) {
        res[2*i + 0] = a[i + 4];
        res[2*i + 1] = b[i + 4];
    }

    return res;
}
694
 
695
// Unsigned variant of vzip1: interleave the low halves of a and b,
// producing { a0,b0, a1,b1, a2,b2, a3,b3 }. Fallback for the AArch64
// vzip1_u8 intrinsic, using GCC/Clang vector element subscripting.
uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t res;

    for (int i = 0; i < 4; ++i) {
        res[2*i + 0] = a[i];
        res[2*i + 1] = b[i];
    }

    return res;
}
705
 
706
// Unsigned variant of vzip2: interleave the high halves of a and b,
// producing { a4,b4, a5,b5, a6,b6, a7,b7 }. Fallback for the AArch64
// vzip2_u8 intrinsic, using GCC/Clang vector element subscripting.
uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t res;

    for (int i = 0; i < 4; ++i) {
        res[2*i + 0] = a[i + 4];
        res[2*i + 1] = b[i + 4];
    }

    return res;
}
716
 
717
// 128-bit variant: interleave the low 8 lanes of a and b into
// { a0,b0, a1,b1, ..., a7,b7 }. Fallback for the AArch64 vzip1q_s8
// intrinsic, using GCC/Clang vector element subscripting.
int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
    int8x16_t res;

    for (int i = 0; i < 8; ++i) {
        res[2*i + 0] = a[i];
        res[2*i + 1] = b[i];
    }

    return res;
}
727
 
728
// 128-bit variant: interleave the high 8 lanes of a and b into
// { a8,b8, a9,b9, ..., a15,b15 }. Fallback for the AArch64 vzip2q_s8
// intrinsic, using GCC/Clang vector element subscripting.
int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
    int8x16_t res;

    for (int i = 0; i < 8; ++i) {
        res[2*i + 0] = a[i + 8];
        res[2*i + 1] = b[i + 8];
    }

    return res;
}
738
 
739
// Unsigned 128-bit variant: interleave the low 8 lanes of a and b into
// { a0,b0, a1,b1, ..., a7,b7 }. Fallback for the AArch64 vzip1q_u8
// intrinsic, using GCC/Clang vector element subscripting.
uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
    uint8x16_t res;

    for (int i = 0; i < 8; ++i) {
        res[2*i + 0] = a[i];
        res[2*i + 1] = b[i];
    }

    return res;
}
749
 
750
// Unsigned 128-bit variant: interleave the high 8 lanes of a and b into
// { a8,b8, a9,b9, ..., a15,b15 }. Fallback for the AArch64 vzip2q_u8
// intrinsic, using GCC/Clang vector element subscripting.
uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
    uint8x16_t res;

    for (int i = 0; i < 8; ++i) {
        res[2*i + 0] = a[i + 8];
        res[2*i + 1] = b[i + 8];
    }

    return res;
}
760
 
761
  int32x4_t vcvtnq_s32_f32(float32x4_t v) {