Dibakar Gope committed on
Commit 0062819 · 1 Parent(s): 9808fbf

ggml : add ggml-aarch64 (ggml/0)

Files changed (2)
  1. ggml/src/ggml-aarch64.c +2193 -0
  2. ggml/src/ggml-aarch64.h +39 -0
ggml/src/ggml-aarch64.c ADDED
@@ -0,0 +1,2193 @@
+ // SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
+ #define GGML_COMMON_IMPL_C
+ #include "ggml-common.h"
+
+ #include "ggml-quants.h"
+ #include "ggml-impl.h"
+
+ #include <math.h>
+ #include <string.h>
+ #include <assert.h>
+ #include <float.h>
+ #include <stdlib.h> // for qsort
+ #include <stdio.h>  // for GGML_ASSERT
+
+ #include "ggml-aarch64.h"
+
+ #if defined(__GNUC__)
+ #pragma GCC diagnostic ignored "-Woverlength-strings"
+ #endif
+
+ #define UNUSED GGML_UNUSED
+
+ // Functions to create the interleaved data layout formats
+
+ // interleave 4 block_q4_0s in blocks of blck_size_interleave
+ // returns an interleaved block_q4_0x4
+ // in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
+ // first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
+ //
+ // - in                   : an array of 4 block_q4_0
+ // - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
+ //                          blck_size_interleave bytes
+ // - xor_mask             : the mask to convert the nibbles in block_q4_0 quants bytes
+ //                          from bias offset form to pure sign form (this saves subtract
+ //                          operations during unpacking)
+ //
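+ // For example, with blck_size_interleave == 4 (QK4_0 == 32, so 16 quant bytes per
+ // block), the index mapping below lays the quants out as:
+ //   out.qs[ 0.. 3] = in[0].qs[0..3],  out.qs[ 4.. 7] = in[1].qs[0..3],
+ //   out.qs[ 8..11] = in[2].qs[0..3],  out.qs[12..15] = in[3].qs[0..3],
+ //   out.qs[16..19] = in[0].qs[4..7],  ... and so on, blck_size_interleave bytes
+ //   taken from each of the 4 blocks per pass.
+ //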
+ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+     block_q4_0x4 out;
+
+     for (int i = 0; i < 4; i++) {
+         out.d[i] = in[i].d;
+     }
+
+     for (int i = 0; i < QK4_0 * 2; i++) {
+         int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
+         int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
+         src_offset += (i % blck_size_interleave);
+
+         out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+     }
+
+     return out;
+ }
+
+ // interleave 8 block_q4_0s in blocks of blck_size_interleave
+ // returns an interleaved block_q4_0x8
+ // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
+ // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
+ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+     block_q4_0x8 out;
+
+     for (int i = 0; i < 8; i++) {
+         out.d[i] = in[i].d;
+     }
+
+     for (int i = 0; i < QK4_0 * 4; i++) {
+         int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
+         int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
+         src_offset += (i % blck_size_interleave);
+
+         out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+     }
+
+     return out;
+ }
+
+ void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) {
+     assert(QK8_0 == 32);
+     assert(k % QK8_0 == 0);
+     const int nb = k / QK8_0;
+
+     block_q8_0x4 * restrict y = (block_q8_0x4 *) vy;
+
+ #if defined(__ARM_NEON)
+     float32x4_t srcv[4][8];
+     float id[4];
+
+     for (int i = 0; i < nb; i++) {
+         float32x4_t asrcv[8];
+         float32x4_t amaxv[8];
+
+         for (int row_iter = 0; row_iter < 4; row_iter++) {
+             for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j);
+             for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]);
+
+             for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]);
+             for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]);
+             for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]);
+
+             const float amax = vmaxvq_f32(amaxv[0]);
+
+             const float d = amax / ((1 << 7) - 1);
+             id[row_iter] = d ? 1.0f / d : 0.0f;
+
+             y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+         }
+
+         for (int j = 0; j < 8; j++) {
+             float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]);
+             int32x4_t vi = vcvtnq_s32_f32(v);
+             y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0);
+             y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1);
+             y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2);
+             y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3);
+
+             v = vmulq_n_f32(srcv[1][j], id[1]);
+             vi = vcvtnq_s32_f32(v);
+             y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0);
+             y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1);
+             y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2);
+             y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3);
+
+             v = vmulq_n_f32(srcv[2][j], id[2]);
+             vi = vcvtnq_s32_f32(v);
+             y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0);
+             y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1);
+             y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2);
+             y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3);
+
+             v = vmulq_n_f32(srcv[3][j], id[3]);
+             vi = vcvtnq_s32_f32(v);
+             y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0);
+             y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1);
+             y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2);
+             y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3);
+         }
+     }
+ #else
+     // scalar
+     const int blck_size_interleave = 4;
+     float srcv[4][QK8_0];
+     float id[4];
+
+     for (int i = 0; i < nb; i++) {
+         for (int row_iter = 0; row_iter < 4; row_iter++) {
+             float amax = 0.0f; // absolute max
+
+             for (int j = 0; j < QK8_0; j++) {
+                 srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
+                 amax = MAX(amax, fabsf(srcv[row_iter][j]));
+             }
+
+             const float d = amax / ((1 << 7) - 1);
+             id[row_iter] = d ? 1.0f / d : 0.0f;
+
+             y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+         }
+
+         for (int j = 0; j < QK8_0 * 4; j++) {
+             int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
+             int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
+             src_offset += (j % blck_size_interleave);
+
+             float x0 = srcv[src_id][src_offset] * id[src_id];
+             y[i].qs[j] = roundf(x0);
+         }
+     }
+ #endif
+ }
+
+ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k) {
+     assert(QK8_0 == 32);
+     assert(k % QK8_0 == 0);
+     const int nb = k / QK8_0;
+
+     block_q8_0x4 * restrict y = (block_q8_0x4 *) vy;
+
+ #if defined(__ARM_NEON)
+     float32x4_t srcv[4][8];
+     float id[4];
+
+     for (int i = 0; i < nb; i++) {
+         float32x4_t asrcv[8];
+         float32x4_t amaxv[8];
+
+         for (int row_iter = 0; row_iter < 4; row_iter++) {
+             for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j);
+             for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]);
+
+             for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]);
+             for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]);
+             for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]);
+
+             const float amax = vmaxvq_f32(amaxv[0]);
+
+             const float d = amax / ((1 << 7) - 1);
+             id[row_iter] = d ? 1.0f / d : 0.0f;
+
+             y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+         }
+
+         for (int j = 0; j < 4; j++) {
+             float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]);
+             int32x4_t vi = vcvtnq_s32_f32(v);
+             y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0);
+             y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1);
+             y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2);
+             y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3);
+             v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]);
+             vi = vcvtnq_s32_f32(v);
+             y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0);
+             y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1);
+             y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2);
+             y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3);
+
+             v = vmulq_n_f32(srcv[1][2 * j], id[1]);
+             vi = vcvtnq_s32_f32(v);
+             y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0);
+             y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1);
+             y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2);
+             y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3);
+             v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]);
+             vi = vcvtnq_s32_f32(v);
+             y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0);
+             y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1);
+             y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2);
+             y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3);
+
+             v = vmulq_n_f32(srcv[2][2 * j], id[2]);
+             vi = vcvtnq_s32_f32(v);
+             y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0);
+             y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1);
+             y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2);
+             y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3);
+             v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]);
+             vi = vcvtnq_s32_f32(v);
+             y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0);
+             y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1);
+             y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2);
+             y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3);
+
+             v = vmulq_n_f32(srcv[3][2 * j], id[3]);
+             vi = vcvtnq_s32_f32(v);
+             y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0);
+             y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1);
+             y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2);
+             y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3);
+             v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]);
+             vi = vcvtnq_s32_f32(v);
+             y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0);
+             y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1);
+             y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2);
+             y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3);
+         }
+     }
+ #else
+     // scalar
+     const int blck_size_interleave = 8;
+     float srcv[4][QK8_0];
+     float id[4];
+
+     for (int i = 0; i < nb; i++) {
+         for (int row_iter = 0; row_iter < 4; row_iter++) {
+             float amax = 0.0f; // absolute max
+
+             for (int j = 0; j < QK8_0; j++) {
+                 srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
+                 amax = MAX(amax, fabsf(srcv[row_iter][j]));
+             }
+
+             const float d = amax / ((1 << 7) - 1);
+             id[row_iter] = d ? 1.0f / d : 0.0f;
+
+             y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+         }
+
+         for (int j = 0; j < QK8_0 * 4; j++) {
+             int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
+             int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
+             src_offset += (j % blck_size_interleave);
+
+             float x0 = srcv[src_id][src_offset] * id[src_id];
+             y[i].qs[j] = roundf(x0);
+         }
+     }
+ #endif
+ }
+
+ void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
+     assert(nrow == 4);
+     UNUSED(nrow);
+     if (blck_size_interleave == 4) {
+         quantize_q8_0_4x4(x, vy, n_per_row);
+     } else if (blck_size_interleave == 8) {
+         quantize_q8_0_4x8(x, vy, n_per_row);
+     } else {
+         assert(false);
+     }
+ }
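+
+ // Illustrative usage (a sketch; the actual caller lives elsewhere in ggml): packing
+ // a 4 x n tile of activations, rows contiguous and of length n each, into
+ // interleaved q8_0x4 blocks for the 4x8 kernels:
+ //
+ //     float src[4 * n];                     // 4 rows of activations
+ //     block_q8_0x4 dst[n / QK8_0];          // one x4 block per 32 columns
+ //     quantize_mat_q8_0(src, dst, 4, n, 8); // 8 = blck_size_interleave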
+
+ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
+     assert(n_per_row % QK4_0 == 0);
+     const int nb = n_per_row / QK4_0;
+
+     void * out_ptr = NULL;
+     if (nrows_interleaved == 8) {
+         out_ptr = (block_q4_0x8 *) dst;
+     }
+     else if (nrows_interleaved == 4) {
+         out_ptr = (block_q4_0x4 *) dst;
+     }
+     assert(nrows_interleaved <= 8);
+     block_q4_0 dst_tmp[8];
+
+     for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) {
+
+         for (int64_t x = 0; x < nb; x++) {
+
+             for (int i = 0; i < nrows_interleaved; i++) {
+                 quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
+             }
+
+             if (nrows_interleaved == 8) {
+                 *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave, 0x88);
+                 out_ptr = (block_q4_0x8 *) out_ptr + 1;
+             }
+             else if (nrows_interleaved == 4) {
+                 *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave, 0x88);
+                 out_ptr = (block_q4_0x4 *) out_ptr + 1;
+             }
+         }
+     }
+
+     return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
+ }
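+
+ // Note on the 0x88 xor mask above: it flips bit 3 of both nibbles in each quant
+ // byte, turning the biased 0..15 encoding of q4_0 into two's-complement -8..7, so
+ // the compute kernels can multiply the nibbles directly without subtracting the
+ // bias. Interleaving does not change the total size, hence the returned byte count
+ // is the same as for nrow plain q4_0 rows.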
+
+ size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+     if (!quant_weights) {
+         return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
+     }
+     else {
+         assert(false);
+         return 0;
+     }
+ }
+
+ size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+     if (!quant_weights) {
+         return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
+     }
+     else {
+         assert(false);
+         return 0;
+     }
+ }
+
+ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+     if (!quant_weights) {
+         return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
+     }
+     else {
+         assert(false);
+         return 0;
+     }
+ }
+
+ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+     const int qk = QK8_0;
+     const int nb = n / qk;
+     const int ncols_interleaved = 4;
+     const int blocklen = 4;
+
+     assert (n % qk == 0);
+     assert (nc % ncols_interleaved == 0);
+
+     UNUSED(s);
+     UNUSED(bs);
+     UNUSED(vx);
+     UNUSED(vy);
+     UNUSED(nr);
+     UNUSED(nc);
+     UNUSED(nb);
+     UNUSED(ncols_interleaved);
+     UNUSED(blocklen);
+
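+     // Runtime guards: when the CPU offers a better-matched kernel (256-bit SVE, or
+     // NEON with int8 matmul), abort with a hint to repack the weights in the
+     // corresponding Q4_0_x_x format instead of silently underperforming.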
+ #if defined(__ARM_FEATURE_SVE)
+     if (svcntw() == 8) {
+         GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
+     }
+ #endif
+ #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
+                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
+ #elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
+     const void * b_ptr = vx;
+     const void * a_ptr = vy;
+     float * res_ptr = s;
+
+     __asm__ __volatile__(
+         "movi v31.16b, #0x4\n"
+         "movi v30.16b, #0xf0\n"
+         "add %x[b_ptr], %x[b_ptr], #0x8\n"
+         "1:"  // Column loop
+         "add x22, %x[a_ptr], #0x2\n"
+         "movi v29.16b, #0x0\n"
+         "mov x21, %x[nb]\n"
+         "2:"  // Block loop
+         "ldr q28, [%x[b_ptr], #0x0]\n"
+         "ldr q27, [x22, #0x0]\n"
+         "movi v26.4s, #0x0\n"
+         "sub x20, x22, #0x2\n"
+         "ldr q25, [x22, #0x10]\n"
+         "ldr q24, [%x[b_ptr], #0x10]\n"
+         "sub x21, x21, #0x1\n"
+         "add x22, x22, #0x22\n"
+         "ldr q23, [%x[b_ptr], #0x20]\n"
+         "ldr q22, [%x[b_ptr], #0x30]\n"
+         "ld1r { v21.8h }, [x20]\n"
+         "ldr q20, [%x[b_ptr], #-0x8]\n"
+         "sshl v16.16b, v28.16b, v31.16b\n"
+         "and v28.16b, v28.16b, v30.16b\n"
+         "sshl v19.16b, v24.16b, v31.16b\n"
+         "and v24.16b, v24.16b, v30.16b\n"
+         "add %x[b_ptr], %x[b_ptr], #0x48\n"
+         "sshl v18.16b, v23.16b, v31.16b\n"
+         "and v23.16b, v23.16b, v30.16b\n"
+         ".inst 0x4f9be21a  // sdot v26.4s, v16.16b, v27.4b[0]\n"
+         "sshl v17.16b, v22.16b, v31.16b\n"
+         "and v22.16b, v22.16b, v30.16b\n"
+         "fcvtl v21.4s, v21.4h\n"
+         "fcvtl v16.4s, v20.4h\n"
+         ".inst 0x4f99e39a  // sdot v26.4s, v28.16b, v25.4b[0]\n"
+         "fmul v16.4s, v16.4s, v21.4s\n"
+         ".inst 0x4fbbe27a  // sdot v26.4s, v19.16b, v27.4b[1]\n"
+         ".inst 0x4fb9e31a  // sdot v26.4s, v24.16b, v25.4b[1]\n"
+         ".inst 0x4f9bea5a  // sdot v26.4s, v18.16b, v27.4b[2]\n"
+         ".inst 0x4f99eafa  // sdot v26.4s, v23.16b, v25.4b[2]\n"
+         ".inst 0x4fbbea3a  // sdot v26.4s, v17.16b, v27.4b[3]\n"
+         ".inst 0x4fb9eada  // sdot v26.4s, v22.16b, v25.4b[3]\n"
+         "scvtf v26.4s, v26.4s, #0x4\n"
+         "fmla v29.4s, v26.4s, v16.4s\n"
+         "cbnz x21, 2b\n"
+         "sub %x[nc], %x[nc], #0x4\n"
+         "str q29, [%x[res_ptr], #0x0]\n"
+         "add %x[res_ptr], %x[res_ptr], #0x10\n"
+         "cbnz %x[nc], 1b\n"
+         : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
+         : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
+         : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
+     );
+ #else
+     float sumf[4];
+     int sumi;
+
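+     // Scalar fallback. Each packed byte holds two 4-bit quants: (q << 4), cast to
+     // int8_t, moves the low nibble into the high bits so it reads as a signed value
+     // scaled by 16, and (q & 0xF0) isolates the high nibble, also scaled by 16; the
+     // >> 4 after the products removes that common factor.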
+     const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+     for (int x = 0; x < nc / ncols_interleaved; x++) {
+         const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+         for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+         for (int l = 0; l < nb; l++) {
+             for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                 for (int j = 0; j < ncols_interleaved; j++) {
+                     sumi = 0;
+                     for (int i = 0; i < blocklen; ++i) {
+                         const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                         const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                         sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+                     }
+                     sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                 }
+             }
+         }
+         for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+     }
+ #endif
+ }
+
+ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+     const int qk = QK8_0;
+     const int nb = n / qk;
+     const int ncols_interleaved = 4;
+     const int blocklen = 8;
+
+     assert (n % qk == 0);
+     assert (nc % ncols_interleaved == 0);
+
+     UNUSED(s);
+     UNUSED(bs);
+     UNUSED(vx);
+     UNUSED(vy);
+     UNUSED(nr);
+     UNUSED(nc);
+     UNUSED(nb);
+     UNUSED(ncols_interleaved);
+     UNUSED(blocklen);
+
+ #if defined(__ARM_FEATURE_SVE)
+     if (svcntw() == 8) {
+         GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
+     }
+ #endif
+ #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
+     const void * b_ptr = vx;
+     const void * a_ptr = vy;
+     float * res_ptr = s;
+
+     __asm__ __volatile__(
+         "movi v2.16b, #0x4\n"
+         "movi v1.16b, #0xf0\n"
+         "add %x[b_ptr], %x[b_ptr], #0x8\n"
+         "1:"  // Column loop
+         "add x23, %x[a_ptr], #0x2\n"
+         "movi v0.16b, #0x0\n"
+         "mov x22, %x[nb]\n"
+         "2:"  // Block loop
+         "ldr q31, [%x[b_ptr], #0x0]\n"
+         "ldr q30, [%x[b_ptr], #0x10]\n"
+         "mov x21, x23\n"
+         "movi v29.4s, #0x0\n"
+         "ldr q28, [%x[b_ptr], #0x20]\n"
+         "ldr q27, [%x[b_ptr], #0x30]\n"
+         "movi v26.4s, #0x0\n"
+         "sub x20, x23, #0x2\n"
+         "ld1r { v25.8h }, [x20]\n"
+         "ldr q24, [%x[b_ptr], #-0x8]\n"
+         "sub x22, x22, #0x1\n"
+         "add x23, x23, #0x22\n"
+         "ld1r { v23.2d }, [x21], #0x8\n"
+         "sshl v22.16b, v31.16b, v2.16b\n"
+         "sshl v16.16b, v30.16b, v2.16b\n"
+         "add %x[b_ptr], %x[b_ptr], #0x48\n"
+         "ld1r { v21.2d }, [x21], #0x8\n"
+         "sshl v20.16b, v28.16b, v2.16b\n"
+         "sshl v19.16b, v27.16b, v2.16b\n"
+         "ld1r { v18.2d }, [x21], #0x8\n"
+         "ld1r { v17.2d }, [x21], #0x8\n"
+         "and v31.16b, v31.16b, v1.16b\n"
+         "and v30.16b, v30.16b, v1.16b\n"
+         ".inst 0x4e9796dd  // sdot v29.4s, v22.16b, v23.16b\n"
+         ".inst 0x4e97961a  // sdot v26.4s, v16.16b, v23.16b\n"
+         "and v28.16b, v28.16b, v1.16b\n"
+         "and v27.16b, v27.16b, v1.16b\n"
+         "fcvtl v25.4s, v25.4h\n"
+         "fcvtl v16.4s, v24.4h\n"
+         ".inst 0x4e95969d  // sdot v29.4s, v20.16b, v21.16b\n"
+         ".inst 0x4e95967a  // sdot v26.4s, v19.16b, v21.16b\n"
+         "fmul v16.4s, v16.4s, v25.4s\n"
+         ".inst 0x4e9297fd  // sdot v29.4s, v31.16b, v18.16b\n"
+         ".inst 0x4e9297da  // sdot v26.4s, v30.16b, v18.16b\n"
+         ".inst 0x4e91979d  // sdot v29.4s, v28.16b, v17.16b\n"
+         ".inst 0x4e91977a  // sdot v26.4s, v27.16b, v17.16b\n"
+         "addp v29.4s, v29.4s, v26.4s\n"
+         "scvtf v29.4s, v29.4s, #0x4\n"
+         "fmla v0.4s, v29.4s, v16.4s\n"
+         "cbnz x22, 2b\n"
+         "sub %x[nc], %x[nc], #0x4\n"
+         "str q0, [%x[res_ptr], #0x0]\n"
+         "add %x[res_ptr], %x[res_ptr], #0x10\n"
+         "cbnz %x[nc], 1b\n"
+         : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
+         : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
+         : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
+     );
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
+     GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
+                 "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
+                 "performance");
+ #else
+     float sumf[4];
+     int sumi;
+
+     const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+     for (int x = 0; x < nc / ncols_interleaved; x++) {
+         const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+         for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+         for (int l = 0; l < nb; l++) {
+             for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                 for (int j = 0; j < ncols_interleaved; j++) {
+                     sumi = 0;
+                     for (int i = 0; i < blocklen; ++i) {
+                         const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                         const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                         sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+                     }
+                     sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                 }
+             }
+         }
+         for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+     }
+ #endif
+ }
+
+ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+     const int qk = QK8_0;
+     const int nb = n / qk;
+     const int ncols_interleaved = 8;
+     const int blocklen = 8;
+
+     assert (n % qk == 0);
+     assert (nc % ncols_interleaved == 0);
+
+     UNUSED(s);
+     UNUSED(bs);
+     UNUSED(vx);
+     UNUSED(vy);
+     UNUSED(nr);
+     UNUSED(nc);
+     UNUSED(nb);
+     UNUSED(ncols_interleaved);
+     UNUSED(blocklen);
+
+ #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
+     if (svcntw() == 8) {
+         const void * b_ptr = vx;
+         const void * a_ptr = vy;
+         float * res_ptr = s;
+
+         __asm__ __volatile__(
+             "ptrue p0.b\n"
+             "add %x[b_ptr], %x[b_ptr], #0x10\n"
+             "1:"  // Column loop
+             "add x22, %x[a_ptr], #0x2\n"
+             "mov z31.b, #0x0\n"
+             "mov x21, %x[nb]\n"
+             "2:"  // Block loop
+             "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n"
+             "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n"
+             "mov z28.s, #0x0\n"
+             "mov z27.s, #0x0\n"
+             "ld1rd { z26.d }, p0/Z, [x22]\n"
+             "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n"
+             "sub x20, x22, #0x2\n"
+             "sub x21, x21, #0x1\n"
+             "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n"
+             "ld1rd { z23.d }, p0/Z, [x22, #8]\n"
+             "lsl z22.b, z30.b, #0x4\n"
+             "lsl z16.b, z29.b, #0x4\n"
+             "and z30.b, z30.b, #0xf0\n"
+             "and z29.b, z29.b, #0xf0\n"
+             "ld1rd { z21.d }, p0/Z, [x22, #16]\n"
+             "ld1rd { z20.d }, p0/Z, [x22, #24]\n"
+             "lsl z19.b, z25.b, #0x4\n"
+             "and z25.b, z25.b, #0xf0\n"
+             "ld1rh { z17.h }, p0/Z, [x20]\n"
+             "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n"
+             "sdot z28.s, z22.b, z26.b\n"
+             "sdot z27.s, z16.b, z26.b\n"
+             "lsl z16.b, z24.b, #0x4\n"
+             "add x22, x22, #0x22\n"
+             "and z24.b, z24.b, #0xf0\n"
+             "add %x[b_ptr], %x[b_ptr], #0x90\n"
+             "fcvt z17.s, p0/m, z17.h\n"
+             "fcvt z18.s, p0/m, z18.h\n"
+             "sdot z28.s, z19.b, z23.b\n"
+             "sdot z27.s, z16.b, z23.b\n"
+             "fmul z18.s, z18.s, z17.s\n"
+             "sdot z28.s, z30.b, z21.b\n"
+             "sdot z27.s, z29.b, z21.b\n"
+             "sdot z28.s, z25.b, z20.b\n"
+             "sdot z27.s, z24.b, z20.b\n"
+             "uzp1 z17.s, z28.s, z27.s\n"
+             "uzp2 z16.s, z28.s, z27.s\n"
+             "add z17.s, z17.s, z16.s\n"
+             "asr z17.s, z17.s, #0x4\n"
+             "scvtf z17.s, p0/m, z17.s\n"
+             "fmla z31.s, p0/M, z17.s, z18.s\n"
+             "cbnz x21, 2b\n"
+             "sub %x[nc], %x[nc], #0x8\n"
+             "st1w { z31.s }, p0, [%x[res_ptr]]\n"
+             "add %x[res_ptr], %x[res_ptr], #0x20\n"
+             "cbnz %x[nc], 1b\n"
+             : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
+             : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
+             : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+         );
+         return;
+     }
+     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+         GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
+                     "performance");
+     }
+     else if (ggml_cpu_has_neon()) {
+         GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
+                     "quantization format for optimal performance");
+     }
+ #endif
+ #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+     GGML_ASSERT(ggml_cpu_has_sve() &&
+                 "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance");
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
+     GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
+                 "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
+                 "performance");
+ #else
+     float sumf[8];
+     int sumi;
+
+     const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+     for (int x = 0; x < nc / ncols_interleaved; x++) {
+         const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+
+         for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+         for (int l = 0; l < nb; l++) {
+             for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                 for (int j = 0; j < ncols_interleaved; j++) {
+                     sumi = 0;
+                     for (int i = 0; i < blocklen; ++i) {
+                         const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                         const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                         sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+                     }
+                     sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                 }
+             }
+         }
+         for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+     }
+ #endif
+ }
+
+ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+     const int qk = QK8_0;
+     const int nb = n / qk;
+     const int ncols_interleaved = 4;
+     const int blocklen = 4;
+
+     assert (n % qk == 0);
+     assert (nr % 4 == 0);
+     assert (nc % ncols_interleaved == 0);
+
+     UNUSED(s);
+     UNUSED(bs);
+     UNUSED(vx);
+     UNUSED(vy);
+     UNUSED(nr);
+     UNUSED(nc);
+     UNUSED(nb);
+     UNUSED(ncols_interleaved);
+     UNUSED(blocklen);
+
+ #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+     if (svcntw() == 8) {
+         GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
+     }
+ #endif
+ #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
+                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
+ #elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
+     const void * b_ptr = vx;
+     const void * a_ptr = vy;
+     float * res_ptr = s;
+     size_t res_stride = bs * sizeof(float);
+
+     __asm__ __volatile__(
+         "mov x10, %x[nr]\n"
+         "mov x9, #0x88\n"
+         "cmp x10, #0x10\n"
+         "mul x9, %x[nb], x9\n"
+         "blt 4f\n"
+         "1:"  // Row loop
+         "add x28, %x[b_ptr], #0x8\n"
+         "mov x27, %x[nc]\n"
+         "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
+         "2:"  // Column loop
+         "add x25, %x[a_ptr], #0x8\n"
+         "movi v15.16b, #0x0\n"
+         "movi v19.16b, #0x0\n"
+         "mov x24, %x[nb]\n"
+         "add x23, x25, x9\n"
+         "movi v18.16b, #0x0\n"
+         "movi v14.16b, #0x0\n"
+         "add x22, x23, x9\n"
+         "movi v11.16b, #0x0\n"
+         "movi v13.16b, #0x0\n"
+         "add x21, x22, x9\n"
+         "movi v23.16b, #0x0\n"
+         "movi v16.16b, #0x0\n"
+         "movi v25.16b, #0x0\n"
+         "movi v7.16b, #0x0\n"
+         "movi v0.16b, #0x0\n"
+         "movi v4.16b, #0x0\n"
+         "movi v5.16b, #0x0\n"
+         "movi v21.16b, #0x0\n"
+         "movi v8.16b, #0x0\n"
+         "movi v1.16b, #0x0\n"
+         "3:"  // Block loop
+         "ldr q3, [x28, #0x0]\n"
+         "ldr q31, [x25, #0x0]\n"
+         "movi v28.16b, #0x4\n"
+         "movi v10.4s, #0x0\n"
+         "ldr q22, [x28, #0x10]\n"
+         "ldr q6, [x25, #0x10]\n"
+         "movi v29.4s, #0x0\n"
+         "movi v9.4s, #0x0\n"
+         "ldr q27, [x28, #0x20]\n"
+         "ldr q30, [x28, #0x30]\n"
+         "movi v20.4s, #0x0\n"
+         "movi v24.16b, #0xf0\n"
+         "ldr d2, [x25, #-0x8]\n"
+         "ldr d26, [x23, #-0x8]\n"
+         "sshl v12.16b, v3.16b, v28.16b\n"
+         "sub x20, x28, #0x8\n"
+         "ldr d17, [x20, #0x0]\n"
+         "and v3.16b, v3.16b, v24.16b\n"
+         "subs x24, x24, #0x1\n"
+         "add x28, x28, #0x48\n"
+         ".inst 0x4f9fe18a  // sdot v10.4s, v12.16b, v31.4b[0]\n"
+         ".inst 0x4fbfe19d  // sdot v29.4s, v12.16b, v31.4b[1]\n"
+         ".inst 0x4f9fe989  // sdot v9.4s, v12.16b, v31.4b[2]\n"
+         ".inst 0x4fbfe994  // sdot v20.4s, v12.16b, v31.4b[3]\n"
+         "sshl v31.16b, v22.16b, v28.16b\n"
+         "and v22.16b, v22.16b, v24.16b\n"
+         "fcvtl v17.4s, v17.4h\n"
+         "fcvtl v2.4s, v2.4h\n"
+         "fcvtl v26.4s, v26.4h\n"
+         ".inst 0x4f86e3ea  // sdot v10.4s, v31.16b, v6.4b[0]\n"
+         ".inst 0x4fa6e3fd  // sdot v29.4s, v31.16b, v6.4b[1]\n"
+         ".inst 0x4f86ebe9  // sdot v9.4s, v31.16b, v6.4b[2]\n"
+         ".inst 0x4fa6ebf4  // sdot v20.4s, v31.16b, v6.4b[3]\n"
+         "sshl v6.16b, v27.16b, v28.16b\n"
+         "sshl v28.16b, v30.16b, v28.16b\n"
+         "and v27.16b, v27.16b, v24.16b\n"
+         "and v30.16b, v30.16b, v24.16b\n"
+         "ldr q24, [x25, #0x20]\n"
+         ".inst 0x4f98e0ca  // sdot v10.4s, v6.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e0dd  // sdot v29.4s, v6.16b, v24.4b[1]\n"
+         ".inst 0x4f98e8c9  // sdot v9.4s, v6.16b, v24.4b[2]\n"
+         ".inst 0x4fb8e8d4  // sdot v20.4s, v6.16b, v24.4b[3]\n"
+         "ldr q24, [x25, #0x30]\n"
+         ".inst 0x4f98e38a  // sdot v10.4s, v28.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e39d  // sdot v29.4s, v28.16b, v24.4b[1]\n"
+         ".inst 0x4f98eb89  // sdot v9.4s, v28.16b, v24.4b[2]\n"
+         ".inst 0x4fb8eb94  // sdot v20.4s, v28.16b, v24.4b[3]\n"
+         "ldr q24, [x25, #0x40]\n"
+         ".inst 0x4f98e06a  // sdot v10.4s, v3.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e07d  // sdot v29.4s, v3.16b, v24.4b[1]\n"
+         ".inst 0x4f98e869  // sdot v9.4s, v3.16b, v24.4b[2]\n"
+         ".inst 0x4fb8e874  // sdot v20.4s, v3.16b, v24.4b[3]\n"
+         "ldr q24, [x25, #0x50]\n"
+         ".inst 0x4f98e2ca  // sdot v10.4s, v22.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e2dd  // sdot v29.4s, v22.16b, v24.4b[1]\n"
+         ".inst 0x4f98eac9  // sdot v9.4s, v22.16b, v24.4b[2]\n"
+         ".inst 0x4fb8ead4  // sdot v20.4s, v22.16b, v24.4b[3]\n"
+         "ldr q24, [x25, #0x60]\n"
+         ".inst 0x4f98e36a  // sdot v10.4s, v27.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e37d  // sdot v29.4s, v27.16b, v24.4b[1]\n"
+         ".inst 0x4f98eb69  // sdot v9.4s, v27.16b, v24.4b[2]\n"
+         ".inst 0x4fb8eb74  // sdot v20.4s, v27.16b, v24.4b[3]\n"
+         "ldr q24, [x25, #0x70]\n"
+         "add x25, x25, #0x88\n"
+         ".inst 0x4f98e3ca  // sdot v10.4s, v30.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e3dd  // sdot v29.4s, v30.16b, v24.4b[1]\n"
+         ".inst 0x4f98ebc9  // sdot v9.4s, v30.16b, v24.4b[2]\n"
+         ".inst 0x4fb8ebd4  // sdot v20.4s, v30.16b, v24.4b[3]\n"
+         "fmul v24.4s, v17.4s, v2.s[0]\n"
+         "scvtf v10.4s, v10.4s, #0x4\n"
+         "scvtf v29.4s, v29.4s, #0x4\n"
+         "scvtf v9.4s, v9.4s, #0x4\n"
+         "scvtf v20.4s, v20.4s, #0x4\n"
+         "fmla v15.4s, v10.4s, v24.4s\n"
+         "ldr q24, [x23, #0x0]\n"
+         "fmul v10.4s, v17.4s, v2.s[1]\n"
+         "fmla v19.4s, v29.4s, v10.4s\n"
+         "ldr q10, [x23, #0x10]\n"
+         "fmul v29.4s, v17.4s, v2.s[2]\n"
+         "fmul v2.4s, v17.4s, v2.s[3]\n"
+         "fmla v18.4s, v9.4s, v29.4s\n"
+         "movi v9.4s, #0x0\n"
+         "movi v29.4s, #0x0\n"
+         ".inst 0x4f98e189  // sdot v9.4s, v12.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e19d  // sdot v29.4s, v12.16b, v24.4b[1]\n"
+         "fmla v14.4s, v20.4s, v2.4s\n"
+         "movi v20.4s, #0x0\n"
+         "movi v2.4s, #0x0\n"
+         ".inst 0x4f98e994  // sdot v20.4s, v12.16b, v24.4b[2]\n"
+         ".inst 0x4fb8e982  // sdot v2.4s, v12.16b, v24.4b[3]\n"
+         "ldr q24, [x23, #0x20]\n"
+         ".inst 0x4f8ae3e9  // sdot v9.4s, v31.16b, v10.4b[0]\n"
+         ".inst 0x4faae3fd  // sdot v29.4s, v31.16b, v10.4b[1]\n"
+         ".inst 0x4f8aebf4  // sdot v20.4s, v31.16b, v10.4b[2]\n"
+         ".inst 0x4faaebe2  // sdot v2.4s, v31.16b, v10.4b[3]\n"
+         "ldr q10, [x23, #0x30]\n"
+         ".inst 0x4f98e0c9  // sdot v9.4s, v6.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e0dd  // sdot v29.4s, v6.16b, v24.4b[1]\n"
+         ".inst 0x4f98e8d4  // sdot v20.4s, v6.16b, v24.4b[2]\n"
+         ".inst 0x4fb8e8c2  // sdot v2.4s, v6.16b, v24.4b[3]\n"
+         "ldr q24, [x23, #0x40]\n"
+         ".inst 0x4f8ae389  // sdot v9.4s, v28.16b, v10.4b[0]\n"
+         ".inst 0x4faae39d  // sdot v29.4s, v28.16b, v10.4b[1]\n"
+         ".inst 0x4f8aeb94  // sdot v20.4s, v28.16b, v10.4b[2]\n"
+         ".inst 0x4faaeb82  // sdot v2.4s, v28.16b, v10.4b[3]\n"
+         "ldr q10, [x23, #0x50]\n"
+         ".inst 0x4f98e069  // sdot v9.4s, v3.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e07d  // sdot v29.4s, v3.16b, v24.4b[1]\n"
+         ".inst 0x4f98e874  // sdot v20.4s, v3.16b, v24.4b[2]\n"
+         ".inst 0x4fb8e862  // sdot v2.4s, v3.16b, v24.4b[3]\n"
+         "ldr q24, [x23, #0x60]\n"
+         ".inst 0x4f8ae2c9  // sdot v9.4s, v22.16b, v10.4b[0]\n"
+         ".inst 0x4faae2dd  // sdot v29.4s, v22.16b, v10.4b[1]\n"
+         ".inst 0x4f8aead4  // sdot v20.4s, v22.16b, v10.4b[2]\n"
+         ".inst 0x4faaeac2  // sdot v2.4s, v22.16b, v10.4b[3]\n"
+         "ldr q10, [x23, #0x70]\n"
+         "add x23, x23, #0x88\n"
+         ".inst 0x4f98e369  // sdot v9.4s, v27.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e37d  // sdot v29.4s, v27.16b, v24.4b[1]\n"
+         ".inst 0x4f98eb74  // sdot v20.4s, v27.16b, v24.4b[2]\n"
+         ".inst 0x4fb8eb62  // sdot v2.4s, v27.16b, v24.4b[3]\n"
+         "ldr q24, [x22, #0x0]\n"
+         ".inst 0x4f8ae3c9  // sdot v9.4s, v30.16b, v10.4b[0]\n"
+         ".inst 0x4faae3dd  // sdot v29.4s, v30.16b, v10.4b[1]\n"
+         ".inst 0x4f8aebd4  // sdot v20.4s, v30.16b, v10.4b[2]\n"
+         ".inst 0x4faaebc2  // sdot v2.4s, v30.16b, v10.4b[3]\n"
+         "fmul v10.4s, v17.4s, v26.s[0]\n"
+         "scvtf v9.4s, v9.4s, #0x4\n"
+         "scvtf v29.4s, v29.4s, #0x4\n"
+         "scvtf v20.4s, v20.4s, #0x4\n"
+         "scvtf v2.4s, v2.4s, #0x4\n"
+         "fmla v11.4s, v9.4s, v10.4s\n"
+         "ldr q9, [x22, #0x10]\n"
+         "fmul v10.4s, v17.4s, v26.s[1]\n"
+         "fmla v13.4s, v29.4s, v10.4s\n"
+         "ldr d29, [x22, #-0x8]\n"
+         "fmul v10.4s, v17.4s, v26.s[2]\n"
+         "fmul v26.4s, v17.4s, v26.s[3]\n"
+         "fcvtl v29.4s, v29.4h\n"
+         "fmla v23.4s, v20.4s, v10.4s\n"
+         "movi v20.4s, #0x0\n"
+         "movi v10.4s, #0x0\n"
+         "fmla v16.4s, v2.4s, v26.4s\n"
+         "movi v26.4s, #0x0\n"
+         "movi v2.4s, #0x0\n"
+         ".inst 0x4f98e194  // sdot v20.4s, v12.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e18a  // sdot v10.4s, v12.16b, v24.4b[1]\n"
+         ".inst 0x4f98e99a  // sdot v26.4s, v12.16b, v24.4b[2]\n"
+         ".inst 0x4fb8e982  // sdot v2.4s, v12.16b, v24.4b[3]\n"
+         "ldr q24, [x22, #0x20]\n"
+         ".inst 0x4f89e3f4  // sdot v20.4s, v31.16b, v9.4b[0]\n"
+         ".inst 0x4fa9e3ea  // sdot v10.4s, v31.16b, v9.4b[1]\n"
+         ".inst 0x4f89ebfa  // sdot v26.4s, v31.16b, v9.4b[2]\n"
+         ".inst 0x4fa9ebe2  // sdot v2.4s, v31.16b, v9.4b[3]\n"
+         "ldr q9, [x22, #0x30]\n"
+         ".inst 0x4f98e0d4  // sdot v20.4s, v6.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e0ca  // sdot v10.4s, v6.16b, v24.4b[1]\n"
+         ".inst 0x4f98e8da  // sdot v26.4s, v6.16b, v24.4b[2]\n"
+         ".inst 0x4fb8e8c2  // sdot v2.4s, v6.16b, v24.4b[3]\n"
+         "ldr q24, [x22, #0x40]\n"
+         ".inst 0x4f89e394  // sdot v20.4s, v28.16b, v9.4b[0]\n"
+         ".inst 0x4fa9e38a  // sdot v10.4s, v28.16b, v9.4b[1]\n"
+         ".inst 0x4f89eb9a  // sdot v26.4s, v28.16b, v9.4b[2]\n"
+         ".inst 0x4fa9eb82  // sdot v2.4s, v28.16b, v9.4b[3]\n"
+         "ldr q9, [x22, #0x50]\n"
+         ".inst 0x4f98e074  // sdot v20.4s, v3.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e06a  // sdot v10.4s, v3.16b, v24.4b[1]\n"
+         ".inst 0x4f98e87a  // sdot v26.4s, v3.16b, v24.4b[2]\n"
+         ".inst 0x4fb8e862  // sdot v2.4s, v3.16b, v24.4b[3]\n"
+         "ldr q24, [x22, #0x60]\n"
+         ".inst 0x4f89e2d4  // sdot v20.4s, v22.16b, v9.4b[0]\n"
+         ".inst 0x4fa9e2ca  // sdot v10.4s, v22.16b, v9.4b[1]\n"
+         ".inst 0x4f89eada  // sdot v26.4s, v22.16b, v9.4b[2]\n"
+         ".inst 0x4fa9eac2  // sdot v2.4s, v22.16b, v9.4b[3]\n"
+         "ldr q9, [x22, #0x70]\n"
+         "add x22, x22, #0x88\n"
+         ".inst 0x4f98e374  // sdot v20.4s, v27.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e36a  // sdot v10.4s, v27.16b, v24.4b[1]\n"
+         ".inst 0x4f98eb7a  // sdot v26.4s, v27.16b, v24.4b[2]\n"
+         ".inst 0x4fb8eb62  // sdot v2.4s, v27.16b, v24.4b[3]\n"
+         "ldr q24, [x21, #0x0]\n"
+         ".inst 0x4f89e3d4  // sdot v20.4s, v30.16b, v9.4b[0]\n"
+         ".inst 0x4fa9e3ca  // sdot v10.4s, v30.16b, v9.4b[1]\n"
+         ".inst 0x4f89ebda  // sdot v26.4s, v30.16b, v9.4b[2]\n"
+         ".inst 0x4fa9ebc2  // sdot v2.4s, v30.16b, v9.4b[3]\n"
+         "fmul v9.4s, v17.4s, v29.s[0]\n"
+         "scvtf v20.4s, v20.4s, #0x4\n"
+         "scvtf v10.4s, v10.4s, #0x4\n"
+         "scvtf v26.4s, v26.4s, #0x4\n"
+         "scvtf v2.4s, v2.4s, #0x4\n"
+         "fmla v25.4s, v20.4s, v9.4s\n"
+         "ldr q9, [x21, #0x10]\n"
+         "fmul v20.4s, v17.4s, v29.s[1]\n"
+         "fmla v7.4s, v10.4s, v20.4s\n"
+         "ldr d20, [x21, #-0x8]\n"
+         "fmul v10.4s, v17.4s, v29.s[2]\n"
+         "fmul v29.4s, v17.4s, v29.s[3]\n"
+         "fcvtl v20.4s, v20.4h\n"
+         "fmla v0.4s, v26.4s, v10.4s\n"
+         "movi v26.4s, #0x0\n"
+         "movi v10.4s, #0x0\n"
+         "fmla v4.4s, v2.4s, v29.4s\n"
+         "movi v2.4s, #0x0\n"
+         "movi v29.4s, #0x0\n"
+         ".inst 0x4f98e19a  // sdot v26.4s, v12.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e18a  // sdot v10.4s, v12.16b, v24.4b[1]\n"
+         ".inst 0x4f98e982  // sdot v2.4s, v12.16b, v24.4b[2]\n"
+         ".inst 0x4fb8e99d  // sdot v29.4s, v12.16b, v24.4b[3]\n"
+         "ldr q12, [x21, #0x20]\n"
+         "fmul v24.4s, v17.4s, v20.s[0]\n"
+         ".inst 0x4f89e3fa  // sdot v26.4s, v31.16b, v9.4b[0]\n"
+         ".inst 0x4fa9e3ea  // sdot v10.4s, v31.16b, v9.4b[1]\n"
+         ".inst 0x4f89ebe2  // sdot v2.4s, v31.16b, v9.4b[2]\n"
+         ".inst 0x4fa9ebfd  // sdot v29.4s, v31.16b, v9.4b[3]\n"
+         "ldr q9, [x21, #0x30]\n"
+         "fmul v31.4s, v17.4s, v20.s[1]\n"
+         ".inst 0x4f8ce0da  // sdot v26.4s, v6.16b, v12.4b[0]\n"
+         ".inst 0x4face0ca  // sdot v10.4s, v6.16b, v12.4b[1]\n"
+         ".inst 0x4f8ce8c2  // sdot v2.4s, v6.16b, v12.4b[2]\n"
+         ".inst 0x4face8dd  // sdot v29.4s, v6.16b, v12.4b[3]\n"
+         "ldr q12, [x21, #0x40]\n"
+         "fmul v6.4s, v17.4s, v20.s[2]\n"
+         "fmul v20.4s, v17.4s, v20.s[3]\n"
+         ".inst 0x4f89e39a  // sdot v26.4s, v28.16b, v9.4b[0]\n"
+         ".inst 0x4fa9e38a  // sdot v10.4s, v28.16b, v9.4b[1]\n"
+         ".inst 0x4f89eb82  // sdot v2.4s, v28.16b, v9.4b[2]\n"
+         ".inst 0x4fa9eb9d  // sdot v29.4s, v28.16b, v9.4b[3]\n"
+         "ldr q9, [x21, #0x50]\n"
+         ".inst 0x4f8ce07a  // sdot v26.4s, v3.16b, v12.4b[0]\n"
+         ".inst 0x4face06a  // sdot v10.4s, v3.16b, v12.4b[1]\n"
+         ".inst 0x4f8ce862  // sdot v2.4s, v3.16b, v12.4b[2]\n"
+         ".inst 0x4face87d  // sdot v29.4s, v3.16b, v12.4b[3]\n"
+         "ldr q12, [x21, #0x60]\n"
+         ".inst 0x4f89e2da  // sdot v26.4s, v22.16b, v9.4b[0]\n"
+         ".inst 0x4fa9e2ca  // sdot v10.4s, v22.16b, v9.4b[1]\n"
+         ".inst 0x4f89eac2  // sdot v2.4s, v22.16b, v9.4b[2]\n"
+         ".inst 0x4fa9eadd  // sdot v29.4s, v22.16b, v9.4b[3]\n"
+         "ldr q17, [x21, #0x70]\n"
+         "add x21, x21, #0x88\n"
+         ".inst 0x4f8ce37a  // sdot v26.4s, v27.16b, v12.4b[0]\n"
+         ".inst 0x4face36a  // sdot v10.4s, v27.16b, v12.4b[1]\n"
+         ".inst 0x4f8ceb62  // sdot v2.4s, v27.16b, v12.4b[2]\n"
+         ".inst 0x4faceb7d  // sdot v29.4s, v27.16b, v12.4b[3]\n"
+         ".inst 0x4f91e3da  // sdot v26.4s, v30.16b, v17.4b[0]\n"
+         ".inst 0x4fb1e3ca  // sdot v10.4s, v30.16b, v17.4b[1]\n"
+         ".inst 0x4f91ebc2  // sdot v2.4s, v30.16b, v17.4b[2]\n"
+         ".inst 0x4fb1ebdd  // sdot v29.4s, v30.16b, v17.4b[3]\n"
+         "scvtf v26.4s, v26.4s, #0x4\n"
+         "scvtf v10.4s, v10.4s, #0x4\n"
+         "fmla v5.4s, v26.4s, v24.4s\n"
+         "scvtf v2.4s, v2.4s, #0x4\n"
+         "scvtf v29.4s, v29.4s, #0x4\n"
+         "fmla v21.4s, v10.4s, v31.4s\n"
+         "fmla v8.4s, v2.4s, v6.4s\n"
+         "fmla v1.4s, v29.4s, v20.4s\n"
+         "bgt 3b\n"
+         "mov x20, %x[res_ptr]\n"
+         "subs x27, x27, #0x4\n"
+         "add %x[res_ptr], %x[res_ptr], #0x10\n"
+         "str q15, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q19, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q18, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q14, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q11, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q13, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q23, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q16, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q25, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q7, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q0, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q4, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q5, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q21, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q8, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "str q1, [x20, #0x0]\n"
+         "bne 2b\n"
+         "mov x20, #0x4\n"
+         "sub x10, x10, #0x10\n"
+         "cmp x10, #0x10\n"
+         "mov %x[res_ptr], x26\n"
+         "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
+         "bge 1b\n"
+         "4:"  // Row loop skip
+         "cbz x10, 9f\n"
+         "5:"  // Row tail: Row loop
+         "add x24, %x[b_ptr], #0x8\n"
+         "mov x23, %x[nc]\n"
+         "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
+         "6:"  // Row tail: Column loop
+         "movi v15.16b, #0x0\n"
+         "movi v19.16b, #0x0\n"
+         "add x25, %x[a_ptr], #0x8\n"
+         "mov x21, %x[nb]\n"
+         "movi v18.16b, #0x0\n"
+         "movi v14.16b, #0x0\n"
+         "7:"  // Row tail: Block loop
+         "ldr q7, [x24, #0x0]\n"
+         "ldr q5, [x25, #0x0]\n"
+         "movi v9.16b, #0x4\n"
+         "movi v4.4s, #0x0\n"
+         "ldr q3, [x24, #0x10]\n"
+         "ldr q2, [x25, #0x10]\n"
+         "movi v1.4s, #0x0\n"
+         "movi v0.4s, #0x0\n"
+         "ldr q13, [x24, #0x20]\n"
+         "ldr q31, [x25, #0x20]\n"
+         "movi v30.4s, #0x0\n"
+         "movi v29.16b, #0xf0\n"
+         "ldr q28, [x24, #0x30]\n"
+         "ldr q27, [x25, #0x30]\n"
+         "sshl v20.16b, v7.16b, v9.16b\n"
+         "sub x20, x24, #0x8\n"
+         "ldr q26, [x25, #0x40]\n"
+         "ldr q25, [x25, #0x50]\n"
+         "sshl v17.16b, v3.16b, v9.16b\n"
+         "and v7.16b, v7.16b, v29.16b\n"
+         "ldr q24, [x25, #0x60]\n"
+         "ldr q16, [x25, #0x70]\n"
+         "sshl v22.16b, v13.16b, v9.16b\n"
+         "and v3.16b, v3.16b, v29.16b\n"
+         "ldr d21, [x20, #0x0]\n"
+         "ldr d12, [x25, #-0x8]\n"
+         ".inst 0x4f85e284  // sdot v4.4s, v20.16b, v5.4b[0]\n"
+         ".inst 0x4fa5e281  // sdot v1.4s, v20.16b, v5.4b[1]\n"
+         ".inst 0x4f85ea80  // sdot v0.4s, v20.16b, v5.4b[2]\n"
+         ".inst 0x4fa5ea9e  // sdot v30.4s, v20.16b, v5.4b[3]\n"
+         "sshl v9.16b, v28.16b, v9.16b\n"
+         "subs x21, x21, #0x1\n"
+         "and v13.16b, v13.16b, v29.16b\n"
+         "and v28.16b, v28.16b, v29.16b\n"
+         "add x25, x25, #0x88\n"
+         "add x24, x24, #0x48\n"
+         "fcvtl v21.4s, v21.4h\n"
+         "fcvtl v12.4s, v12.4h\n"
+         ".inst 0x4f82e224  // sdot v4.4s, v17.16b, v2.4b[0]\n"
+         ".inst 0x4fa2e221  // sdot v1.4s, v17.16b, v2.4b[1]\n"
+         ".inst 0x4f82ea20  // sdot v0.4s, v17.16b, v2.4b[2]\n"
+         ".inst 0x4fa2ea3e  // sdot v30.4s, v17.16b, v2.4b[3]\n"
+         "fmul v11.4s, v21.4s, v12.s[0]\n"
+         "fmul v23.4s, v21.4s, v12.s[1]\n"
+         "fmul v17.4s, v21.4s, v12.s[2]\n"
+         ".inst 0x4f9fe2c4  // sdot v4.4s, v22.16b, v31.4b[0]\n"
+         "fmul v6.4s, v21.4s, v12.s[3]\n"
+         ".inst 0x4fbfe2c1  // sdot v1.4s, v22.16b, v31.4b[1]\n"
+         ".inst 0x4f9feac0  // sdot v0.4s, v22.16b, v31.4b[2]\n"
+         ".inst 0x4fbfeade  // sdot v30.4s, v22.16b, v31.4b[3]\n"
+         ".inst 0x4f9be124  // sdot v4.4s, v9.16b, v27.4b[0]\n"
+         ".inst 0x4fbbe121  // sdot v1.4s, v9.16b, v27.4b[1]\n"
+         ".inst 0x4f9be920  // sdot v0.4s, v9.16b, v27.4b[2]\n"
+         ".inst 0x4fbbe93e  // sdot v30.4s, v9.16b, v27.4b[3]\n"
+         ".inst 0x4f9ae0e4  // sdot v4.4s, v7.16b, v26.4b[0]\n"
+         ".inst 0x4fbae0e1  // sdot v1.4s, v7.16b, v26.4b[1]\n"
+         ".inst 0x4f9ae8e0  // sdot v0.4s, v7.16b, v26.4b[2]\n"
+         ".inst 0x4fbae8fe  // sdot v30.4s, v7.16b, v26.4b[3]\n"
+         ".inst 0x4f99e064  // sdot v4.4s, v3.16b, v25.4b[0]\n"
+         ".inst 0x4fb9e061  // sdot v1.4s, v3.16b, v25.4b[1]\n"
+         ".inst 0x4f99e860  // sdot v0.4s, v3.16b, v25.4b[2]\n"
+         ".inst 0x4fb9e87e  // sdot v30.4s, v3.16b, v25.4b[3]\n"
+         ".inst 0x4f98e1a4  // sdot v4.4s, v13.16b, v24.4b[0]\n"
+         ".inst 0x4fb8e1a1  // sdot v1.4s, v13.16b, v24.4b[1]\n"
+         ".inst 0x4f98e9a0  // sdot v0.4s, v13.16b, v24.4b[2]\n"
+         ".inst 0x4fb8e9be  // sdot v30.4s, v13.16b, v24.4b[3]\n"
+         ".inst 0x4f90e384  // sdot v4.4s, v28.16b, v16.4b[0]\n"
+         ".inst 0x4fb0e381  // sdot v1.4s, v28.16b, v16.4b[1]\n"
+         ".inst 0x4f90eb80  // sdot v0.4s, v28.16b, v16.4b[2]\n"
+         ".inst 0x4fb0eb9e  // sdot v30.4s, v28.16b, v16.4b[3]\n"
+         "scvtf v4.4s, v4.4s, #0x4\n"
+         "scvtf v1.4s, v1.4s, #0x4\n"
+         "scvtf v0.4s, v0.4s, #0x4\n"
+         "fmla v15.4s, v4.4s, v11.4s\n"
+         "scvtf v30.4s, v30.4s, #0x4\n"
+         "fmla v19.4s, v1.4s, v23.4s\n"
+         "fmla v18.4s, v0.4s, v17.4s\n"
+         "fmla v14.4s, v30.4s, v6.4s\n"
+         "bgt 7b\n"
+         "mov x20, %x[res_ptr]\n"
+         "cmp x10, #0x1\n"
+         "str q15, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "ble 8f\n"
+         "cmp x10, #0x2\n"
+         "str q19, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "ble 8f\n"
+         "cmp x10, #0x3\n"
+         "str q18, [x20, #0x0]\n"
+         "add x20, x20, %x[res_stride]\n"
+         "ble 8f\n"
+         "str q14, [x20, #0x0]\n"
+         "8:"  // Row tail: Accumulator store skip
+         "subs x23, x23, #0x4\n"
+         "add %x[res_ptr], %x[res_ptr], #0x10\n"
+         "bne 6b\n"
+         "subs x10, x10, #0x4\n"
+         "add %x[a_ptr], %x[a_ptr], x9\n"
+         "mov %x[res_ptr], x22\n"
+         "bgt 5b\n"
+         "9:"  // Row tail: Row loop skip
+         : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
+         : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
+         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+     );
+ #else
+     float sumf[4][4];
+     int sumi;
+
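+     // Scalar reference path: the activations come in 4-row tiles (block_q8_0x4), so
+     // the m loop walks the 4 interleaved rows of a tile against the 4 interleaved
+     // columns of each block_q4_0x4, accumulating a 4x4 tile of the result.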
1216
+ for (int y = 0; y < nr / 4; y++) {
1217
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1218
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1219
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
1220
+ for (int m = 0; m < 4; m++) {
1221
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1222
+ }
1223
+ for (int l = 0; l < nb; l++) {
1224
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1225
+ for (int m = 0; m < 4; m++) {
1226
+ for (int j = 0; j < ncols_interleaved; j++) {
1227
+ sumi = 0;
1228
+ for (int i = 0; i < blocklen; ++i) {
1229
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1230
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1231
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1232
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1233
+ }
1234
+ sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
1235
+ }
1236
+ }
1237
+ }
1238
+ }
1239
+ for (int m = 0; m < 4; m++) {
1240
+ for (int j = 0; j < ncols_interleaved; j++)
1241
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1242
+ }
1243
+ }
1244
+ }
1245
+ #endif
1246
+ }
1247
+
1248
+ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
1249
+ const int qk = QK8_0;
1250
+ const int nb = n / qk;
1251
+ const int ncols_interleaved = 4;
1252
+ const int blocklen = 8;
1253
+
1254
+ assert (n % qk == 0);
1255
+ assert (nr % 4 == 0);
1256
+ assert (nc % ncols_interleaved == 0);
1257
+
1258
+ UNUSED(s);
1259
+ UNUSED(bs);
1260
+ UNUSED(vx);
1261
+ UNUSED(vy);
1262
+ UNUSED(nr);
1263
+ UNUSED(nc);
1264
+ UNUSED(nb);
1265
+ UNUSED(ncols_interleaved);
1266
+ UNUSED(blocklen);
1267
+
1268
+ #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1269
+ if (svcntw() == 8) {
1270
+ GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
1271
+ "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
1272
+ }
1273
+ #endif
+ #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
+ const void * b_ptr = vx;
+ const void * a_ptr = vy;
+ float * res_ptr = s;
+ size_t res_stride = bs * sizeof(float);
+
+ __asm__ __volatile__(
+ "mov x10, %x[nr]\n"
+ "mov x9, #0x88\n"
+ "cmp x10, #0x10\n"
+ "mul x9, %x[nb], x9\n"
+ "blt 4f\n"
+ "1:" // Row loop
+ "add x28, %x[b_ptr], #0x8\n"
+ "mov x27, %x[nc]\n"
+ "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
+ "2:" // Column loop
+ "add x25, %x[a_ptr], #0x8\n"
+ "movi v2.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "mov x24, %x[nb]\n"
+ "add x23, x25, x9\n"
+ "movi v12.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "add x22, x23, x9\n"
+ "movi v11.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "add x21, x22, x9\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v5.16b, #0x0\n"
+ "movi v7.16b, #0x0\n"
+ "movi v4.16b, #0x0\n"
+ "movi v6.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "3:" // Block loop
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q16, [x28, #0x10]\n"
+ "movi v1.16b, #0x4\n"
+ "movi v19.4s, #0x0\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q15, [x25, #0x10]\n"
+ "movi v26.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "ldr q29, [x28, #0x20]\n"
+ "ldr q3, [x28, #0x30]\n"
+ "movi v17.4s, #0x0\n"
+ "movi v0.16b, #0xf0\n"
+ "ldr d20, [x25, #-0x8]\n"
+ "ldr d9, [x23, #-0x8]\n"
+ "sshl v8.16b, v21.16b, v1.16b\n"
+ "sshl v31.16b, v16.16b, v1.16b\n"
+ "and v21.16b, v21.16b, v0.16b\n"
+ "and v16.16b, v16.16b, v0.16b\n"
+ "sub x20, x28, #0x8\n"
+ "subs x24, x24, #0x1\n"
+ "add x28, x28, #0x48\n"
+ ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n"
+ ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n"
+ "ldr q27, [x25, #0x20]\n"
+ ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n"
+ ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n"
+ "sshl v15.16b, v29.16b, v1.16b\n"
+ "sshl v1.16b, v3.16b, v1.16b\n"
+ "and v29.16b, v29.16b, v0.16b\n"
+ "and v3.16b, v3.16b, v0.16b\n"
+ "ldr q0, [x25, #0x30]\n"
+ "fcvtl v20.4s, v20.4h\n"
+ ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n"
+ "fcvtl v9.4s, v9.4h\n"
+ ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n"
+ "ldr q27, [x25, #0x40]\n"
+ ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
+ "ldr q0, [x25, #0x50]\n"
+ ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n"
+ ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n"
+ "ldr q27, [x25, #0x60]\n"
+ ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n"
+ ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n"
+ "ldr q0, [x25, #0x70]\n"
+ "add x25, x25, #0x88\n"
+ ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n"
+ ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n"
+ "ldr d27, [x20, #0x0]\n"
+ ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n"
+ ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n"
+ "fcvtl v27.4s, v27.4h\n"
+ "uzp1 v0.2d, v19.2d, v26.2d\n"
+ "uzp2 v26.2d, v19.2d, v26.2d\n"
+ "fmul v19.4s, v27.4s, v20.s[0]\n"
+ "scvtf v0.4s, v0.4s, #0x4\n"
+ "scvtf v26.4s, v26.4s, #0x4\n"
+ "fmla v2.4s, v0.4s, v19.4s\n"
+ "ldr q19, [x23, #0x0]\n"
+ "uzp1 v0.2d, v18.2d, v17.2d\n"
+ "uzp2 v18.2d, v18.2d, v17.2d\n"
+ "fmul v17.4s, v27.4s, v20.s[1]\n"
+ "scvtf v0.4s, v0.4s, #0x4\n"
+ "scvtf v18.4s, v18.4s, #0x4\n"
+ "fmla v10.4s, v26.4s, v17.4s\n"
+ "ldr q17, [x23, #0x10]\n"
+ "fmul v26.4s, v27.4s, v20.s[2]\n"
+ "fmul v20.4s, v27.4s, v20.s[3]\n"
+ "fmla v12.4s, v0.4s, v26.4s\n"
+ "ldr d0, [x22, #-0x8]\n"
+ "ldr d26, [x21, #-0x8]\n"
+ "fcvtl v0.4s, v0.4h\n"
+ "fmla v28.4s, v18.4s, v20.4s\n"
+ "movi v20.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
+ ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
+ "ldr q19, [x23, #0x20]\n"
+ "fcvtl v26.4s, v26.4h\n"
+ ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
+ ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
+ "ldr q19, [x23, #0x40]\n"
+ ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
+ ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
+ "ldr q19, [x23, #0x60]\n"
+ ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n"
+ ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n"
+ "uzp1 v19.2d, v20.2d, v18.2d\n"
+ "scvtf v19.4s, v19.4s, #0x4\n"
+ "uzp2 v20.2d, v20.2d, v18.2d\n"
+ "fmul v18.4s, v27.4s, v9.s[0]\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "fmla v11.4s, v19.4s, v18.4s\n"
+ "ldr q18, [x22, #0x0]\n"
+ "fmul v19.4s, v27.4s, v9.s[1]\n"
+ "fmla v13.4s, v20.4s, v19.4s\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n"
+ ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n"
+ "ldr q17, [x23, #0x30]\n"
+ ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n"
+ ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n"
+ "ldr q17, [x23, #0x50]\n"
+ ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n"
+ ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n"
+ "ldr q17, [x23, #0x70]\n"
+ "add x23, x23, #0x88\n"
+ ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n"
+ "uzp1 v17.2d, v19.2d, v20.2d\n"
+ "scvtf v17.4s, v17.4s, #0x4\n"
+ "uzp2 v20.2d, v19.2d, v20.2d\n"
+ "fmul v19.4s, v27.4s, v9.s[2]\n"
+ "fmul v9.4s, v27.4s, v9.s[3]\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "fmla v22.4s, v17.4s, v19.4s\n"
+ "ldr q17, [x22, #0x10]\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n"
+ "fmla v23.4s, v20.4s, v9.4s\n"
+ "movi v20.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n"
+ "ldr q18, [x22, #0x20]\n"
+ ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
+ ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n"
+ ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n"
+ "ldr q18, [x22, #0x40]\n"
+ ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n"
+ ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n"
+ "ldr q18, [x22, #0x60]\n"
+ ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n"
+ ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n"
+ "ldr q17, [x22, #0x30]\n"
+ ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
+ ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n"
+ "ldr q17, [x22, #0x50]\n"
+ ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n"
+ ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n"
+ "ldr q17, [x22, #0x70]\n"
+ "add x22, x22, #0x88\n"
+ ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n"
+ "uzp1 v17.2d, v19.2d, v20.2d\n"
+ "uzp2 v20.2d, v19.2d, v20.2d\n"
+ "fmul v19.4s, v27.4s, v0.s[0]\n"
+ "scvtf v17.4s, v17.4s, #0x4\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "fmla v25.4s, v17.4s, v19.4s\n"
+ "ldr q19, [x21, #0x0]\n"
+ "fmul v17.4s, v27.4s, v0.s[1]\n"
+ "fmla v5.4s, v20.4s, v17.4s\n"
+ "ldr q17, [x21, #0x10]\n"
+ "uzp1 v20.2d, v9.2d, v18.2d\n"
+ "uzp2 v9.2d, v9.2d, v18.2d\n"
+ "fmul v18.4s, v27.4s, v0.s[2]\n"
+ "fmul v0.4s, v27.4s, v0.s[3]\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "scvtf v9.4s, v9.4s, #0x4\n"
+ "fmla v7.4s, v20.4s, v18.4s\n"
+ "movi v20.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
+ ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
+ "ldr q19, [x21, #0x20]\n"
+ "fmla v4.4s, v9.4s, v0.4s\n"
+ "movi v9.4s, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
+ "fmul v8.4s, v27.4s, v26.s[0]\n"
+ ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n"
+ "ldr q17, [x21, #0x30]\n"
+ ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
+ "fmul v31.4s, v27.4s, v26.s[1]\n"
+ ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
+ "ldr q19, [x21, #0x40]\n"
+ ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
+ "fmul v15.4s, v27.4s, v26.s[2]\n"
+ "fmul v27.4s, v27.4s, v26.s[3]\n"
+ ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n"
+ "ldr q1, [x21, #0x50]\n"
+ ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
+ ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
+ "ldr q26, [x21, #0x60]\n"
+ ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n"
+ ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n"
+ "ldr q21, [x21, #0x70]\n"
+ "add x21, x21, #0x88\n"
+ ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n"
+ ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n"
+ ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n"
+ ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n"
+ "uzp1 v29.2d, v20.2d, v18.2d\n"
+ "uzp2 v21.2d, v20.2d, v18.2d\n"
+ "scvtf v29.4s, v29.4s, #0x4\n"
+ "uzp1 v18.2d, v9.2d, v0.2d\n"
+ "uzp2 v16.2d, v9.2d, v0.2d\n"
+ "scvtf v21.4s, v21.4s, #0x4\n"
+ "fmla v6.4s, v29.4s, v8.4s\n"
+ "scvtf v18.4s, v18.4s, #0x4\n"
+ "scvtf v16.4s, v16.4s, #0x4\n"
+ "fmla v30.4s, v21.4s, v31.4s\n"
+ "fmla v24.4s, v18.4s, v15.4s\n"
+ "fmla v14.4s, v16.4s, v27.4s\n"
+ "bgt 3b\n"
+ "mov x20, %x[res_ptr]\n"
+ "subs x27, x27, #0x4\n"
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
+ "str q2, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q10, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q12, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q28, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q11, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q13, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q22, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q23, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q25, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q5, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q7, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q4, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q6, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q30, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q24, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "str q14, [x20, #0x0]\n"
+ "bne 2b\n"
+ "mov x20, #0x4\n"
+ "sub x10, x10, #0x10\n"
+ "cmp x10, #0x10\n"
+ "mov %x[res_ptr], x26\n"
+ "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
+ "bge 1b\n"
+ "4:" // Row loop skip
+ "cbz x10, 9f\n"
+ "5:" // Row tail: Row loop
+ "add x24, %x[b_ptr], #0x8\n"
+ "mov x23, %x[nc]\n"
+ "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
+ "6:" // Row tail: Column loop
+ "movi v2.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "add x25, %x[a_ptr], #0x8\n"
+ "mov x21, %x[nb]\n"
+ "movi v12.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "7:" // Row tail: Block loop
+ "ldr q6, [x24, #0x0]\n"
+ "ldr q5, [x24, #0x10]\n"
+ "movi v17.16b, #0x4\n"
+ "movi v8.4s, #0x0\n"
+ "ldr q4, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "movi v27.4s, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ "ldr q31, [x24, #0x20]\n"
+ "ldr q14, [x24, #0x30]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v22.16b, #0xf0\n"
+ "ldr q11, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "sshl v21.16b, v6.16b, v17.16b\n"
+ "sshl v16.16b, v5.16b, v17.16b\n"
+ "ldr q20, [x25, #0x40]\n"
+ "ldr q26, [x25, #0x50]\n"
+ "and v6.16b, v6.16b, v22.16b\n"
+ "and v5.16b, v5.16b, v22.16b\n"
+ "ldr q25, [x25, #0x60]\n"
+ "ldr q3, [x25, #0x70]\n"
+ "sshl v19.16b, v31.16b, v17.16b\n"
+ "sshl v18.16b, v14.16b, v17.16b\n"
+ "ldr d17, [x25, #-0x8]\n"
+ ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n"
+ ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n"
+ "and v31.16b, v31.16b, v22.16b\n"
+ ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n"
+ ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n"
+ "and v14.16b, v14.16b, v22.16b\n"
+ "sub x20, x24, #0x8\n"
+ "ldr d16, [x20, #0x0]\n"
+ "subs x21, x21, #0x1\n"
+ "add x25, x25, #0x88\n"
+ "fcvtl v17.4s, v17.4h\n"
+ "add x24, x24, #0x48\n"
+ ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n"
+ ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n"
+ ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n"
+ ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n"
+ "fcvtl v16.4s, v16.4h\n"
+ ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n"
+ ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n"
+ "fmul v23.4s, v16.4s, v17.s[0]\n"
+ "fmul v21.4s, v16.4s, v17.s[1]\n"
+ "fmul v1.4s, v16.4s, v17.s[2]\n"
+ "fmul v20.4s, v16.4s, v17.s[3]\n"
+ ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n"
+ ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n"
+ ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n"
+ ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n"
+ ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n"
+ ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n"
+ "uzp1 v19.2d, v8.2d, v27.2d\n"
+ "uzp2 v18.2d, v8.2d, v27.2d\n"
+ "scvtf v19.4s, v19.4s, #0x4\n"
+ "uzp1 v17.2d, v0.2d, v29.2d\n"
+ "uzp2 v16.2d, v0.2d, v29.2d\n"
+ "scvtf v18.4s, v18.4s, #0x4\n"
+ "fmla v2.4s, v19.4s, v23.4s\n"
+ "scvtf v17.4s, v17.4s, #0x4\n"
+ "scvtf v16.4s, v16.4s, #0x4\n"
+ "fmla v10.4s, v18.4s, v21.4s\n"
+ "fmla v12.4s, v17.4s, v1.4s\n"
+ "fmla v28.4s, v16.4s, v20.4s\n"
+ "bgt 7b\n"
+ "mov x20, %x[res_ptr]\n"
+ "cmp x10, #0x1\n"
+ "str q2, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "ble 8f\n"
+ "cmp x10, #0x2\n"
+ "str q10, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "ble 8f\n"
+ "cmp x10, #0x3\n"
+ "str q12, [x20, #0x0]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "ble 8f\n"
+ "str q28, [x20, #0x0]\n"
+ "8:" // Row tail: Accumulator store skip
+ "subs x23, x23, #0x4\n"
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
+ "bne 6b\n"
+ "subs x10, x10, #0x4\n"
+ "add %x[a_ptr], %x[a_ptr], x9\n"
+ "mov %x[res_ptr], x22\n"
+ "bgt 5b\n"
+ "9:" // Row tail: Row loop skip
+ : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
+ : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
+ GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
+ "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
+ "performance");
+ #else
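+ // Scalar reference path. v0/v1 recover the two nibbles of each packed Q4
+ // byte (low nibble via <<4, high nibble via the 0xF0 mask, both still
+ // carrying a x16 factor); the >>4 on each pair of products removes that
+ // factor, and the integer sum is then scaled by the two fp16 block scales.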
+ float sumf[4][4];
+ int sumi;
+
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+ }
+ sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+ }
+ }
+ }
+ }
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++)
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+ }
+ }
+ }
+ #endif
+ }
+
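+ // GEMM variant for the Q4_0_8_8 layout: 8 interleaved columns with an
+ // 8-byte block length, with a 256-bit SVE kernel as the primary fast path.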
+ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 8;
+ const int blocklen = 8;
+
+ assert (n % qk == 0);
+ assert (nr % 4 == 0);
+ assert (nc % ncols_interleaved == 0);
+
+ UNUSED(s);
+ UNUSED(bs);
+ UNUSED(vx);
+ UNUSED(vy);
+ UNUSED(nr);
+ UNUSED(nc);
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+ #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
+ if (svcntw() == 8) {
+ const void * b_ptr = vx;
+ const void * a_ptr = vy;
+ float * res_ptr = s;
+ size_t res_stride = bs * sizeof(float);
+
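+ // 256-bit SVE path (svcntw() == 8): one z register holds 8 floats, matching
+ // the 8 interleaved columns of a block_q4_0x8, so each z accumulator below
+ // carries one 8-wide row of the output tile.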
+ __asm__ __volatile__(
+ "mov x20, #0x4\n"
+ "mov x13, %x[nr]\n"
+ "mov z28.s, #-0x4\n"
+ "mov x12, #0x88\n"
+ "ptrue p1.b\n"
+ "whilelt p0.s, XZR, x20\n"
+ "cmp x13, #0x10\n"
+ "mul x12, %x[nb], x12\n"
+ "blt 4f\n"
+ "1:" // Row loop
+ "add x11, %x[b_ptr], #0x10\n"
+ "mov x10, %x[nc]\n"
+ "add x9, %x[res_ptr], %x[res_stride], LSL #4\n"
+ "2:" // Column loop
+ "add x28, %x[a_ptr], #0x8\n"
+ "mov z24.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov x27, %x[nb]\n"
+ "add x26, x28, x12\n"
+ "mov z12.b, #0x0\n"
+ "mov z0.b, #0x0\n"
+ "add x25, x26, x12\n"
+ "mov z13.b, #0x0\n"
+ "mov z1.b, #0x0\n"
+ "add x24, x25, x12\n"
+ "mov z20.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z8.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "3:" // Block loop
+ "ld1b { z30.b }, p1/Z, [x11]\n"
+ "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n"
+ "mov z18.s, #0x0\n"
+ "mov z7.s, #0x0\n"
+ "ld1rqb { z3.b }, p1/Z, [x28]\n"
+ "ld1rqb { z5.b }, p1/Z, [x28, #16]\n"
+ "mov z9.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n"
+ "sub x20, x11, #0x10\n"
+ "sub x23, x28, #0x8\n"
+ "lsl z31.b, z30.b, #0x4\n"
+ "lsl z6.b, z21.b, #0x4\n"
+ "ld1h { z23.s }, p1/Z, [x20]\n"
+ "sub x22, x26, #0x8\n"
+ "and z30.b, z30.b, #0xf0\n"
+ "and z21.b, z21.b, #0xf0\n"
+ "sub x21, x25, #0x8\n"
+ "sub x20, x24, #0x8\n"
+ "lsl z14.b, z4.b, #0x4\n"
+ "lsl z2.b, z17.b, #0x4\n"
+ "subs x27, x27, #0x1\n"
+ "add x11, x11, #0x90\n"
+ ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n"
+ ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n"
+ "ld1rqb { z3.b }, p1/Z, [x28, #32]\n"
+ "and z4.b, z4.b, #0xf0\n"
+ ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n"
+ ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n"
+ "ld1rqb { z5.b }, p1/Z, [x28, #48]\n"
+ "and z17.b, z17.b, #0xf0\n"
+ "fcvt z23.s, p1/m, z23.h\n"
+ ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n"
+ ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n"
+ "ld1rqb { z3.b }, p1/Z, [x28, #64]\n"
+ ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n"
+ ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n"
+ "ld1rqb { z5.b }, p1/Z, [x28, #80]\n"
+ "fscale z23.s, p1/m, z23.s, z28.s\n"
+ ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n"
+ ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n"
+ "ld1rqb { z3.b }, p1/Z, [x28, #96]\n"
+ ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n"
+ ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n"
+ "ld1rqb { z5.b }, p1/Z, [x28, #112]\n"
+ "add x28, x28, #0x88\n"
+ ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n"
+ ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n"
+ "ld1h { z3.s }, p0/Z, [x23]\n"
+ ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n"
+ ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n"
+ "fcvt z3.s, p1/m, z3.h\n"
+ "uzp1 z5.d, z18.d, z7.d\n"
+ "uzp2 z18.d, z18.d, z7.d\n"
+ "mov z3.q, z3.q[0]\n"
+ "uzp1 z7.d, z9.d, z22.d\n"
+ "uzp2 z22.d, z9.d, z22.d\n"
+ "fmul z9.s, z23.s, z3.s[0]\n"
+ "scvtf z5.s, p1/m, z5.s\n"
+ "scvtf z18.s, p1/m, z18.s\n"
+ "scvtf z7.s, p1/m, z7.s\n"
+ "scvtf z22.s, p1/m, z22.s\n"
+ "fmla z24.s, p1/M, z5.s, z9.s\n"
+ "ld1rqb { z5.b }, p1/Z, [x26]\n"
+ "fmul z9.s, z23.s, z3.s[1]\n"
+ "fmla z15.s, p1/M, z18.s, z9.s\n"
+ "ld1rqb { z18.b }, p1/Z, [x26, #16]\n"
+ "fmul z9.s, z23.s, z3.s[2]\n"
+ "fmul z3.s, z23.s, z3.s[3]\n"
+ "fmla z12.s, p1/M, z7.s, z9.s\n"
+ "mov z9.s, #0x0\n"
+ "ld1h { z7.s }, p0/Z, [x22]\n"
+ ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n"
+ "fmla z0.s, p1/M, z22.s, z3.s\n"
+ "mov z22.s, #0x0\n"
+ "ld1h { z3.s }, p0/Z, [x21]\n"
+ ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n"
+ "ld1rqb { z5.b }, p1/Z, [x26, #32]\n"
+ "fcvt z7.s, p1/m, z7.h\n"
+ "fcvt z3.s, p1/m, z3.h\n"
+ ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n"
+ ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n"
+ "ld1rqb { z5.b }, p1/Z, [x26, #64]\n"
+ "mov z7.q, z7.q[0]\n"
+ "mov z3.q, z3.q[0]\n"
+ ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n"
+ ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n"
+ "ld1rqb { z5.b }, p1/Z, [x26, #96]\n"
+ ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n"
+ ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n"
+ "uzp1 z5.d, z9.d, z22.d\n"
+ "scvtf z5.s, p1/m, z5.s\n"
+ "uzp2 z22.d, z9.d, z22.d\n"
+ "fmul z9.s, z23.s, z7.s[0]\n"
+ "scvtf z22.s, p1/m, z22.s\n"
+ "fmla z13.s, p1/M, z5.s, z9.s\n"
+ "ld1rqb { z9.b }, p1/Z, [x25]\n"
+ "fmul z5.s, z23.s, z7.s[1]\n"
+ "fmla z1.s, p1/M, z22.s, z5.s\n"
+ "mov z5.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n"
+ ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n"
+ "ld1rqb { z18.b }, p1/Z, [x26, #48]\n"
+ ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n"
+ ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n"
+ "ld1rqb { z18.b }, p1/Z, [x26, #80]\n"
+ ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n"
+ ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n"
+ "ld1rqb { z18.b }, p1/Z, [x26, #112]\n"
+ "add x26, x26, #0x88\n"
+ ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n"
+ ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n"
+ "uzp1 z18.d, z5.d, z22.d\n"
+ "scvtf z18.s, p1/m, z18.s\n"
+ "uzp2 z22.d, z5.d, z22.d\n"
+ "fmul z5.s, z23.s, z7.s[2]\n"
+ "fmul z7.s, z23.s, z7.s[3]\n"
+ "scvtf z22.s, p1/m, z22.s\n"
+ "fmla z20.s, p1/M, z18.s, z5.s\n"
+ "ld1rqb { z18.b }, p1/Z, [x25, #16]\n"
+ "ld1h { z5.s }, p0/Z, [x20]\n"
+ "fcvt z5.s, p1/m, z5.h\n"
+ "fmla z25.s, p1/M, z22.s, z7.s\n"
+ "mov z22.s, #0x0\n"
+ "mov z7.s, #0x0\n"
+ ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n"
+ ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n"
+ "ld1rqb { z9.b }, p1/Z, [x25, #32]\n"
+ "mov z5.q, z5.q[0]\n"
+ ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n"
+ ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n"
+ "ld1rqb { z9.b }, p1/Z, [x25, #64]\n"
+ ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n"
+ ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n"
+ "ld1rqb { z9.b }, p1/Z, [x25, #96]\n"
+ ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n"
+ ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n"
+ "uzp1 z9.d, z22.d, z7.d\n"
+ "scvtf z9.s, p1/m, z9.s\n"
+ "uzp2 z22.d, z22.d, z7.d\n"
+ "fmul z7.s, z23.s, z3.s[0]\n"
+ "scvtf z22.s, p1/m, z22.s\n"
+ "fmla z11.s, p1/M, z9.s, z7.s\n"
+ "ld1rqb { z9.b }, p1/Z, [x24]\n"
+ "fmul z7.s, z23.s, z3.s[1]\n"
+ "fmla z16.s, p1/M, z22.s, z7.s\n"
+ "mov z22.s, #0x0\n"
+ "mov z7.s, #0x0\n"
+ ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n"
+ ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n"
+ "ld1rqb { z18.b }, p1/Z, [x25, #48]\n"
+ ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n"
+ ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n"
+ "ld1rqb { z18.b }, p1/Z, [x25, #80]\n"
+ ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n"
+ ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n"
+ "ld1rqb { z18.b }, p1/Z, [x25, #112]\n"
+ "add x25, x25, #0x88\n"
+ ".inst 0x45049a56 // smmla z22.s, z18.b, z4.b\n"
+ ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n"
+ "uzp1 z18.d, z22.d, z7.d\n"
+ "scvtf z18.s, p1/m, z18.s\n"
+ "uzp2 z7.d, z22.d, z7.d\n"
+ "fmul z22.s, z23.s, z3.s[2]\n"
+ "fmul z3.s, z23.s, z3.s[3]\n"
+ "scvtf z7.s, p1/m, z7.s\n"
+ "fmla z19.s, p1/M, z18.s, z22.s\n"
+ "ld1rqb { z18.b }, p1/Z, [x24, #16]\n"
+ "fmul z22.s, z23.s, z5.s[0]\n"
+ "fmla z26.s, p1/M, z7.s, z3.s\n"
+ "mov z3.s, #0x0\n"
+ "mov z7.s, #0x0\n"
+ ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n"
+ ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n"
+ "ld1rqb { z9.b }, p1/Z, [x24, #32]\n"
+ ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n"
+ ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n"
+ "mov z9.s, #0x0\n"
+ ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n"
+ "mov z31.s, #0x0\n"
+ ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n"
+ "ld1rqb { z6.b }, p1/Z, [x24, #48]\n"
+ "ld1rqb { z18.b }, p1/Z, [x24, #64]\n"
+ ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n"
+ "fmul z14.s, z23.s, z5.s[1]\n"
+ ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n"
+ "ld1rqb { z6.b }, p1/Z, [x24, #80]\n"
+ "fmul z2.s, z23.s, z5.s[2]\n"
+ "fmul z23.s, z23.s, z5.s[3]\n"
+ ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n"
+ ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n"
+ "ld1rqb { z5.b }, p1/Z, [x24, #96]\n"
+ ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n"
+ ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n"
+ "ld1rqb { z18.b }, p1/Z, [x24, #112]\n"
+ "add x24, x24, #0x88\n"
+ ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n"
+ ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n"
+ ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n"
+ ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n"
+ "uzp1 z18.d, z3.d, z7.d\n"
+ "uzp2 z5.d, z3.d, z7.d\n"
+ "scvtf z18.s, p1/m, z18.s\n"
+ "uzp1 z6.d, z9.d, z31.d\n"
+ "uzp2 z9.d, z9.d, z31.d\n"
+ "scvtf z5.s, p1/m, z5.s\n"
+ "fmla z8.s, p1/M, z18.s, z22.s\n"
+ "scvtf z6.s, p1/m, z6.s\n"
+ "scvtf z9.s, p1/m, z9.s\n"
+ "fmla z29.s, p1/M, z5.s, z14.s\n"
+ "fmla z27.s, p1/M, z6.s, z2.s\n"
+ "fmla z10.s, p1/M, z9.s, z23.s\n"
+ "bgt 3b\n"
+ "mov x20, %x[res_ptr]\n"
+ "subs x10, x10, #0x8\n"
+ "add %x[res_ptr], %x[res_ptr], #0x20\n"
+ "st1w { z24.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z15.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z12.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z0.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z13.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z1.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z20.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z25.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z11.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z16.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z19.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z26.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z8.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z29.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z27.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "st1w { z10.s }, p1, [x20]\n"
+ "bne 2b\n"
+ "mov x20, #0x4\n"
+ "sub x13, x13, #0x10\n"
+ "cmp x13, #0x10\n"
+ "mov %x[res_ptr], x9\n"
+ "madd %x[a_ptr], x20, x12, %x[a_ptr]\n"
+ "bge 1b\n"
+ "4:" // Row loop skip
+ "cbz x13, 9f\n"
+ "5:" // Row tail: Row loop
+ "add x25, %x[b_ptr], #0x10\n"
+ "mov x24, %x[nc]\n"
+ "add x23, %x[res_ptr], %x[res_stride], LSL #2\n"
+ "6:" // Row tail: Column loop
+ "mov z24.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "add x28, %x[a_ptr], #0x8\n"
+ "mov x22, %x[nb]\n"
+ "mov z12.b, #0x0\n"
+ "mov z0.b, #0x0\n"
+ "7:" // Row tail: Block loop
+ "ld1b { z3.b }, p1/Z, [x25]\n"
+ "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "mov z2.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "ld1rqb { z26.b }, p1/Z, [x28]\n"
+ "ld1rqb { z21.b }, p1/Z, [x28, #16]\n"
+ "mov z27.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n"
+ "sub x21, x25, #0x10\n"
+ "sub x20, x28, #0x8\n"
+ "lsl z20.b, z3.b, #0x4\n"
+ "lsl z4.b, z6.b, #0x4\n"
+ "ld1rqb { z10.b }, p1/Z, [x28, #32]\n"
+ "ld1rqb { z23.b }, p1/Z, [x28, #48]\n"
+ "and z3.b, z3.b, #0xf0\n"
+ "and z6.b, z6.b, #0xf0\n"
+ "ld1rqb { z11.b }, p1/Z, [x28, #64]\n"
+ "ld1rqb { z7.b }, p1/Z, [x28, #80]\n"
+ "lsl z8.b, z29.b, #0x4\n"
+ "lsl z14.b, z16.b, #0x4\n"
+ "ld1rqb { z18.b }, p1/Z, [x28, #96]\n"
+ "ld1rqb { z30.b }, p1/Z, [x28, #112]\n"
+ ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n"
+ ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n"
+ "and z29.b, z29.b, #0xf0\n"
+ "ld1h { z17.s }, p1/Z, [x21]\n"
+ ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n"
+ ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n"
+ "and z16.b, z16.b, #0xf0\n"
+ "ld1h { z4.s }, p0/Z, [x20]\n"
+ "subs x22, x22, #0x1\n"
+ "add x28, x28, #0x88\n"
+ "fcvt z17.s, p1/m, z17.h\n"
+ "add x25, x25, #0x90\n"
+ ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n"
+ ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n"
+ "fcvt z4.s, p1/m, z4.h\n"
+ ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n"
+ ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n"
+ "fscale z17.s, p1/m, z17.s, z28.s\n"
+ "mov z4.q, z4.q[0]\n"
+ ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n"
+ ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n"
+ "fmul z23.s, z17.s, z4.s[0]\n"
+ "fmul z9.s, z17.s, z4.s[1]\n"
+ "fmul z21.s, z17.s, z4.s[2]\n"
+ "fmul z4.s, z17.s, z4.s[3]\n"
+ ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n"
+ ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n"
+ ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n"
+ ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n"
+ ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n"
+ ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n"
+ "uzp1 z31.d, z2.d, z25.d\n"
+ "uzp2 z13.d, z2.d, z25.d\n"
+ "scvtf z31.s, p1/m, z31.s\n"
+ "uzp1 z17.d, z27.d, z19.d\n"
+ "uzp2 z18.d, z27.d, z19.d\n"
+ "scvtf z13.s, p1/m, z13.s\n"
+ "fmla z24.s, p1/M, z31.s, z23.s\n"
+ "scvtf z17.s, p1/m, z17.s\n"
+ "scvtf z18.s, p1/m, z18.s\n"
+ "fmla z15.s, p1/M, z13.s, z9.s\n"
+ "fmla z12.s, p1/M, z17.s, z21.s\n"
+ "fmla z0.s, p1/M, z18.s, z4.s\n"
+ "bgt 7b\n"
+ "mov x20, %x[res_ptr]\n"
+ "cmp x13, #0x1\n"
+ "st1w { z24.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "ble 8f\n"
+ "cmp x13, #0x2\n"
+ "st1w { z15.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "ble 8f\n"
+ "cmp x13, #0x3\n"
+ "st1w { z12.s }, p1, [x20]\n"
+ "add x20, x20, %x[res_stride]\n"
+ "ble 8f\n"
+ "st1w { z0.s }, p1, [x20]\n"
+ "8:" // Row tail: Accumulator store skip
+ "subs x24, x24, #0x8\n"
+ "add %x[res_ptr], %x[res_ptr], #0x20\n"
+ "bne 6b\n"
+ "subs x13, x13, #0x4\n"
+ "add %x[a_ptr], %x[a_ptr], x12\n"
+ "mov %x[res_ptr], x23\n"
+ "bgt 5b\n"
+ "9:" // Row tail: Row loop skip
+ : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
+ : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+ return;
+ }
+ else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+ GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
+ "performance");
+ }
+ else if (ggml_cpu_has_neon()) {
+ GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+ "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
+ "quantization format for optimal performance");
+ }
+ #endif
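+ // Compile-time fallbacks: builds without a matching SIMD kernel assert at
+ // runtime with a hint towards the quantization format better suited to the
+ // detected CPU features.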
+ #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+ GGML_ASSERT(ggml_cpu_has_sve() &&
+ "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance");
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
+ GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
+ "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
+ "performance");
+ #else
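+ // Scalar reference path, identical to the 4-column variant above except for
+ // the 8-column interleave (block_q4_0x8, sumf[4][8]).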
+ float sumf[4][8];
+ int sumi;
+
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+ }
+ sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+ }
+ }
+ }
+ }
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++)
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+ }
+ }
+ }
+ #endif
+ }
ggml/src/ggml-aarch64.h ADDED
@@ -0,0 +1,39 @@
+ // SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
+ #pragma once
+
+ #define GGML_COMMON_DECL_C
+ #include "ggml-common.h"
+
+ #include "ggml.h"
+
+ // GGML internal header
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ // Quantization
+ void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+ void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+ void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
+
+ // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
+ size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
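+ // For the kernels below: n is the shared (K) dimension, s is the float
+ // output with row stride bs, vx holds the interleaved quantized weights,
+ // vy the quantized activations, and nr/nc are the rows/columns of s.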
+ // GEMV
+ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+
+ // GEMM
+ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+
+ #ifdef __cplusplus
+ }
+ #endif
+