Spaces:
Running
Running
Dan Johansson
committed on
Commit
·
abf6f22
1
Parent(s):
2868c2b
ggml : optimize Q4_0 into Q4_0_X_Y repack (llama/10324)
Browse files
- ggml/src/ggml-aarch64.c +43 -14
- ggml/src/ggml-cpu/ggml-cpu-aarch64.c +43 -14
ggml/src/ggml-aarch64.c
CHANGED
|
@@ -8,19 +8,42 @@
|
|
| 8 |
|
| 9 |
#define UNUSED GGML_UNUSED
|
| 10 |
|
| 11 |
-
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave
|
| 12 |
block_q4_0x4 out;
|
| 13 |
|
| 14 |
for (int i = 0; i < 4; i++) {
|
| 15 |
out.d[i] = in[i].d;
|
| 16 |
}
|
| 17 |
|
| 18 |
-
|
| 19 |
-
int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
|
| 20 |
-
int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
|
| 21 |
-
src_offset += (i % blck_size_interleave);
|
| 22 |
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
}
|
| 25 |
|
| 26 |
return out;
|
|
@@ -30,19 +53,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
|
|
| 30 |
// returns an interleaved block_q4_0x8
|
| 31 |
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
|
| 32 |
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
|
| 33 |
-
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave
|
| 34 |
block_q4_0x8 out;
|
| 35 |
|
| 36 |
for (int i = 0; i < 8; i++) {
|
| 37 |
out.d[i] = in[i].d;
|
| 38 |
}
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
| 46 |
}
|
| 47 |
|
| 48 |
return out;
|
|
@@ -71,11 +100,11 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
|
|
| 71 |
}
|
| 72 |
|
| 73 |
if (nrows_interleaved == 8) {
|
| 74 |
-
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave
|
| 75 |
out_ptr = (block_q4_0x8 *) out_ptr + 1;
|
| 76 |
}
|
| 77 |
else if (nrows_interleaved == 4) {
|
| 78 |
-
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave
|
| 79 |
out_ptr = (block_q4_0x4 *) out_ptr + 1;
|
| 80 |
}
|
| 81 |
}
|
|
|
|
| 8 |
|
| 9 |
#define UNUSED GGML_UNUSED
|
| 10 |
|
| 11 |
+
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
|
| 12 |
block_q4_0x4 out;
|
| 13 |
|
| 14 |
for (int i = 0; i < 4; i++) {
|
| 15 |
out.d[i] = in[i].d;
|
| 16 |
}
|
| 17 |
|
| 18 |
+
const int end = QK4_0 * 2 / blck_size_interleave;
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
+
if (blck_size_interleave == 8) {
|
| 21 |
+
const uint64_t xor_mask = 0x8888888888888888ULL;
|
| 22 |
+
for (int i = 0; i < end; ++i) {
|
| 23 |
+
int src_id = i % 4;
|
| 24 |
+
int src_offset = (i / 4) * blck_size_interleave;
|
| 25 |
+
int dst_offset = i * blck_size_interleave;
|
| 26 |
+
|
| 27 |
+
uint64_t elems;
|
| 28 |
+
// Using memcpy to avoid unaligned memory accesses
|
| 29 |
+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
| 30 |
+
elems ^= xor_mask;
|
| 31 |
+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
| 32 |
+
}
|
| 33 |
+
} else if (blck_size_interleave == 4) {
|
| 34 |
+
const uint32_t xor_mask = 0x88888888;
|
| 35 |
+
for (int i = 0; i < end; ++i) {
|
| 36 |
+
int src_id = i % 4;
|
| 37 |
+
int src_offset = (i / 4) * blck_size_interleave;
|
| 38 |
+
int dst_offset = i * blck_size_interleave;
|
| 39 |
+
|
| 40 |
+
uint32_t elems;
|
| 41 |
+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
|
| 42 |
+
elems ^= xor_mask;
|
| 43 |
+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
|
| 44 |
+
}
|
| 45 |
+
} else {
|
| 46 |
+
GGML_ASSERT(false);
|
| 47 |
}
|
| 48 |
|
| 49 |
return out;
|
|
|
|
| 53 |
// returns an interleaved block_q4_0x8
|
| 54 |
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
|
| 55 |
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
|
| 56 |
+
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
|
| 57 |
block_q4_0x8 out;
|
| 58 |
|
| 59 |
for (int i = 0; i < 8; i++) {
|
| 60 |
out.d[i] = in[i].d;
|
| 61 |
}
|
| 62 |
|
| 63 |
+
const int end = QK4_0 * 4 / blck_size_interleave;
|
| 64 |
+
const uint64_t xor_mask = 0x8888888888888888ULL;
|
| 65 |
+
|
| 66 |
+
for (int i = 0; i < end; ++i) {
|
| 67 |
+
int src_id = i % 8;
|
| 68 |
+
int src_offset = (i / 8) * blck_size_interleave;
|
| 69 |
+
int dst_offset = i * blck_size_interleave;
|
| 70 |
|
| 71 |
+
uint64_t elems;
|
| 72 |
+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
| 73 |
+
elems ^= xor_mask;
|
| 74 |
+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
| 75 |
}
|
| 76 |
|
| 77 |
return out;
|
|
|
|
| 100 |
}
|
| 101 |
|
| 102 |
if (nrows_interleaved == 8) {
|
| 103 |
+
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
|
| 104 |
out_ptr = (block_q4_0x8 *) out_ptr + 1;
|
| 105 |
}
|
| 106 |
else if (nrows_interleaved == 4) {
|
| 107 |
+
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
|
| 108 |
out_ptr = (block_q4_0x4 *) out_ptr + 1;
|
| 109 |
}
|
| 110 |
}
|
ggml/src/ggml-cpu/ggml-cpu-aarch64.c
CHANGED
|
@@ -3387,19 +3387,42 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
| 3387 |
}
|
| 3388 |
|
| 3389 |
// FIXME: this code is duplicated from ggml-aarch64.c
|
| 3390 |
-
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave
|
| 3391 |
block_q4_0x4 out;
|
| 3392 |
|
| 3393 |
for (int i = 0; i < 4; i++) {
|
| 3394 |
out.d[i] = in[i].d;
|
| 3395 |
}
|
| 3396 |
|
| 3397 |
-
|
| 3398 |
-
int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
|
| 3399 |
-
int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
|
| 3400 |
-
src_offset += (i % blck_size_interleave);
|
| 3401 |
|
| 3402 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3403 |
}
|
| 3404 |
|
| 3405 |
return out;
|
|
@@ -3409,19 +3432,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
|
|
| 3409 |
// returns an interleaved block_q4_0x8
|
| 3410 |
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
|
| 3411 |
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
|
| 3412 |
-
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave
|
| 3413 |
block_q4_0x8 out;
|
| 3414 |
|
| 3415 |
for (int i = 0; i < 8; i++) {
|
| 3416 |
out.d[i] = in[i].d;
|
| 3417 |
}
|
| 3418 |
|
| 3419 |
-
|
| 3420 |
-
|
| 3421 |
-
|
| 3422 |
-
|
|
|
|
|
|
|
|
|
|
| 3423 |
|
| 3424 |
-
|
|
|
|
|
|
|
|
|
|
| 3425 |
}
|
| 3426 |
|
| 3427 |
return out;
|
|
@@ -3449,7 +3478,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
|
|
| 3449 |
for (int i = 0; i < nrows_interleaved; i++) {
|
| 3450 |
dst_tmp[i] = src[x + i * nblocks];
|
| 3451 |
}
|
| 3452 |
-
*dst++ = make_block_q4_0x4(dst_tmp, interleave_block
|
| 3453 |
}
|
| 3454 |
src += nrows_interleaved * nblocks;
|
| 3455 |
}
|
|
@@ -3480,7 +3509,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block,
|
|
| 3480 |
for (int i = 0; i < nrows_interleaved; i++ ) {
|
| 3481 |
dst_tmp[i] = src[x + i * nblocks];
|
| 3482 |
}
|
| 3483 |
-
*dst++ = make_block_q4_0x8(dst_tmp, interleave_block
|
| 3484 |
}
|
| 3485 |
src += nrows_interleaved * nblocks;
|
| 3486 |
}
|
|
|
|
| 3387 |
}
|
| 3388 |
|
| 3389 |
// FIXME: this code is duplicated from ggml-aarch64.c
|
| 3390 |
+
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
|
| 3391 |
block_q4_0x4 out;
|
| 3392 |
|
| 3393 |
for (int i = 0; i < 4; i++) {
|
| 3394 |
out.d[i] = in[i].d;
|
| 3395 |
}
|
| 3396 |
|
| 3397 |
+
const int end = QK4_0 * 2 / blck_size_interleave;
|
|
|
|
|
|
|
|
|
|
| 3398 |
|
| 3399 |
+
if (blck_size_interleave == 8) {
|
| 3400 |
+
const uint64_t xor_mask = 0x8888888888888888ULL;
|
| 3401 |
+
for (int i = 0; i < end; ++i) {
|
| 3402 |
+
int src_id = i % 4;
|
| 3403 |
+
int src_offset = (i / 4) * blck_size_interleave;
|
| 3404 |
+
int dst_offset = i * blck_size_interleave;
|
| 3405 |
+
|
| 3406 |
+
uint64_t elems;
|
| 3407 |
+
// Using memcpy to avoid unaligned memory accesses
|
| 3408 |
+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
| 3409 |
+
elems ^= xor_mask;
|
| 3410 |
+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
| 3411 |
+
}
|
| 3412 |
+
} else if (blck_size_interleave == 4) {
|
| 3413 |
+
const uint32_t xor_mask = 0x88888888;
|
| 3414 |
+
for (int i = 0; i < end; ++i) {
|
| 3415 |
+
int src_id = i % 4;
|
| 3416 |
+
int src_offset = (i / 4) * blck_size_interleave;
|
| 3417 |
+
int dst_offset = i * blck_size_interleave;
|
| 3418 |
+
|
| 3419 |
+
uint32_t elems;
|
| 3420 |
+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
|
| 3421 |
+
elems ^= xor_mask;
|
| 3422 |
+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
|
| 3423 |
+
}
|
| 3424 |
+
} else {
|
| 3425 |
+
GGML_ASSERT(false);
|
| 3426 |
}
|
| 3427 |
|
| 3428 |
return out;
|
|
|
|
| 3432 |
// returns an interleaved block_q4_0x8
|
| 3433 |
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
|
| 3434 |
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
|
| 3435 |
+
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
|
| 3436 |
block_q4_0x8 out;
|
| 3437 |
|
| 3438 |
for (int i = 0; i < 8; i++) {
|
| 3439 |
out.d[i] = in[i].d;
|
| 3440 |
}
|
| 3441 |
|
| 3442 |
+
const int end = QK4_0 * 4 / blck_size_interleave;
|
| 3443 |
+
const uint64_t xor_mask = 0x8888888888888888ULL;
|
| 3444 |
+
|
| 3445 |
+
for (int i = 0; i < end; ++i) {
|
| 3446 |
+
int src_id = i % 8;
|
| 3447 |
+
int src_offset = (i / 8) * blck_size_interleave;
|
| 3448 |
+
int dst_offset = i * blck_size_interleave;
|
| 3449 |
|
| 3450 |
+
uint64_t elems;
|
| 3451 |
+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
| 3452 |
+
elems ^= xor_mask;
|
| 3453 |
+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
| 3454 |
}
|
| 3455 |
|
| 3456 |
return out;
|
|
|
|
| 3478 |
for (int i = 0; i < nrows_interleaved; i++) {
|
| 3479 |
dst_tmp[i] = src[x + i * nblocks];
|
| 3480 |
}
|
| 3481 |
+
*dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
|
| 3482 |
}
|
| 3483 |
src += nrows_interleaved * nblocks;
|
| 3484 |
}
|
|
|
|
| 3509 |
for (int i = 0; i < nrows_interleaved; i++ ) {
|
| 3510 |
dst_tmp[i] = src[x + i * nblocks];
|
| 3511 |
}
|
| 3512 |
+
*dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
|
| 3513 |
}
|
| 3514 |
src += nrows_interleaved * nblocks;
|
| 3515 |
}
|