Spaces:
Running
Running
ggml : fix 32-bit ARM build + quantization
Browse files
Makefile
CHANGED
|
@@ -177,7 +177,7 @@ ifdef WHISPER_GPROF
|
|
| 177 |
endif
|
| 178 |
|
| 179 |
ifneq ($(filter aarch64%,$(UNAME_M)),)
|
| 180 |
-
CFLAGS
|
| 181 |
CXXFLAGS += -mcpu=native
|
| 182 |
endif
|
| 183 |
|
|
@@ -188,15 +188,18 @@ endif
|
|
| 188 |
|
| 189 |
ifneq ($(filter armv7%,$(UNAME_M)),)
|
| 190 |
# 32-bit ARM, for example on Armbian or possibly raspbian
|
| 191 |
-
CFLAGS
|
|
|
|
| 192 |
|
| 193 |
-
# 64-bit ARM, use these (TODO: auto-detect 64-bit)
|
| 194 |
-
|
|
|
|
| 195 |
endif
|
| 196 |
|
| 197 |
ifneq ($(filter armv8%,$(UNAME_M)),)
|
| 198 |
# Raspberry Pi 4
|
| 199 |
-
CFLAGS
|
|
|
|
| 200 |
endif
|
| 201 |
|
| 202 |
#
|
|
|
|
| 177 |
endif
|
| 178 |
|
| 179 |
ifneq ($(filter aarch64%,$(UNAME_M)),)
|
| 180 |
+
CFLAGS += -mcpu=native
|
| 181 |
CXXFLAGS += -mcpu=native
|
| 182 |
endif
|
| 183 |
|
|
|
|
| 188 |
|
| 189 |
ifneq ($(filter armv7%,$(UNAME_M)),)
|
| 190 |
# 32-bit ARM, for example on Armbian or possibly raspbian
|
| 191 |
+
#CFLAGS += -mfpu=neon -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
|
| 192 |
+
#CXXFLAGS += -mfpu=neon -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
|
| 193 |
|
| 194 |
+
# 64-bit ARM on 32-bit OS, use these (TODO: auto-detect 64-bit)
|
| 195 |
+
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
|
| 196 |
+
CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
|
| 197 |
endif
|
| 198 |
|
| 199 |
ifneq ($(filter armv8%,$(UNAME_M)),)
|
| 200 |
# Raspberry Pi 4
|
| 201 |
+
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
|
| 202 |
+
CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
|
| 203 |
endif
|
| 204 |
|
| 205 |
#
|
ggml.c
CHANGED
|
@@ -671,35 +671,91 @@ float vmaxvq_f32(float32x4_t v) {
|
|
| 671 |
}
|
| 672 |
|
| 673 |
int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
|
| 674 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 675 |
}
|
| 676 |
|
| 677 |
int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
|
| 678 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 679 |
}
|
| 680 |
|
| 681 |
uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
| 682 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
}
|
| 684 |
|
| 685 |
uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
| 686 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 687 |
}
|
| 688 |
|
| 689 |
int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
|
| 690 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 691 |
}
|
| 692 |
|
| 693 |
int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
|
| 694 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
}
|
| 696 |
|
| 697 |
uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
|
| 698 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 699 |
}
|
| 700 |
|
| 701 |
uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
|
| 702 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 703 |
}
|
| 704 |
|
| 705 |
int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
|
|
|
| 671 |
}
|
| 672 |
|
| 673 |
int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
|
| 674 |
+
int8x8_t res;
|
| 675 |
+
|
| 676 |
+
res[0] = a[0]; res[1] = b[0];
|
| 677 |
+
res[2] = a[1]; res[3] = b[1];
|
| 678 |
+
res[4] = a[2]; res[5] = b[2];
|
| 679 |
+
res[6] = a[3]; res[7] = b[3];
|
| 680 |
+
|
| 681 |
+
return res;
|
| 682 |
}
|
| 683 |
|
| 684 |
int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
|
| 685 |
+
int8x8_t res;
|
| 686 |
+
|
| 687 |
+
res[0] = a[4]; res[1] = b[4];
|
| 688 |
+
res[2] = a[5]; res[3] = b[5];
|
| 689 |
+
res[4] = a[6]; res[5] = b[6];
|
| 690 |
+
res[6] = a[7]; res[7] = b[7];
|
| 691 |
+
|
| 692 |
+
return res;
|
| 693 |
}
|
| 694 |
|
| 695 |
uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
| 696 |
+
uint8x8_t res;
|
| 697 |
+
|
| 698 |
+
res[0] = a[0]; res[1] = b[0];
|
| 699 |
+
res[2] = a[1]; res[3] = b[1];
|
| 700 |
+
res[4] = a[2]; res[5] = b[2];
|
| 701 |
+
res[6] = a[3]; res[7] = b[3];
|
| 702 |
+
|
| 703 |
+
return res;
|
| 704 |
}
|
| 705 |
|
| 706 |
uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
| 707 |
+
uint8x8_t res;
|
| 708 |
+
|
| 709 |
+
res[0] = a[4]; res[1] = b[4];
|
| 710 |
+
res[2] = a[5]; res[3] = b[5];
|
| 711 |
+
res[4] = a[6]; res[5] = b[6];
|
| 712 |
+
res[6] = a[7]; res[7] = b[7];
|
| 713 |
+
|
| 714 |
+
return res;
|
| 715 |
}
|
| 716 |
|
| 717 |
int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
|
| 718 |
+
int8x16_t res;
|
| 719 |
+
|
| 720 |
+
res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
|
| 721 |
+
res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
|
| 722 |
+
res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
|
| 723 |
+
res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
|
| 724 |
+
|
| 725 |
+
return res;
|
| 726 |
}
|
| 727 |
|
| 728 |
int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
|
| 729 |
+
int8x16_t res;
|
| 730 |
+
|
| 731 |
+
res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
|
| 732 |
+
res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
|
| 733 |
+
res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
|
| 734 |
+
res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
|
| 735 |
+
|
| 736 |
+
return res;
|
| 737 |
}
|
| 738 |
|
| 739 |
uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
|
| 740 |
+
uint8x16_t res;
|
| 741 |
+
|
| 742 |
+
res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
|
| 743 |
+
res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
|
| 744 |
+
res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
|
| 745 |
+
res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
|
| 746 |
+
|
| 747 |
+
return res;
|
| 748 |
}
|
| 749 |
|
| 750 |
uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
|
| 751 |
+
uint8x16_t res;
|
| 752 |
+
|
| 753 |
+
res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
|
| 754 |
+
res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
|
| 755 |
+
res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
|
| 756 |
+
res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
|
| 757 |
+
|
| 758 |
+
return res;
|
| 759 |
}
|
| 760 |
|
| 761 |
int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|