Dan Johansson committed
Commit abf6f22 · Parent: 2868c2b

ggml : optimize Q4_0 into Q4_0_X_Y repack (llama/10324)

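This commit rewrites the make_block_q4_0x4/make_block_q4_0x8 repack helpers to move one interleave chunk (4 or 8 bytes) per iteration instead of one byte at a time. The per-byte XOR mask 0x88 is widened to 0x88888888 / 0x8888888888888888ULL, which applies the same per-byte transform (flipping the top bit of each packed 4-bit nibble) in a single 32- or 64-bit operation, and the now-redundant xor_mask parameter is dropped from the helpers' signatures. A minimal sketch of the idea, separate from the ggml code below (the function names here are illustrative only):

#include <stdint.h>
#include <string.h>

// Byte-wise form: one XOR per quant byte.
static void repack_xor_bytewise(uint8_t * q, int n) {
    for (int i = 0; i < n; i++) {
        q[i] ^= 0x88;  // flip the high bit of each 4-bit nibble
    }
}

// Chunked form: the same transform, 8 bytes at a time. memcpy is used
// instead of a uint64_t* cast so unaligned buffers remain well-defined.
static void repack_xor_chunked(uint8_t * q, int n) {
    for (int i = 0; i + 8 <= n; i += 8) {
        uint64_t elems;
        memcpy(&elems, q + i, sizeof(elems));
        elems ^= 0x8888888888888888ULL;  // 0x88 replicated into every byte
        memcpy(q + i, &elems, sizeof(elems));
    }
}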
ggml/src/ggml-aarch64.c CHANGED
@@ -8,19 +8,42 @@
 
 #define UNUSED GGML_UNUSED
 
-static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;
 
     for (int i = 0; i < 4; i++) {
         out.d[i] = in[i].d;
     }
 
-    for (int i = 0; i < QK4_0 * 2; i++) {
-        int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
-        int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
-        src_offset += (i % blck_size_interleave);
+    const int end = QK4_0 * 2 / blck_size_interleave;
 
-        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    if (blck_size_interleave == 8) {
+        const uint64_t xor_mask = 0x8888888888888888ULL;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint64_t elems;
+            // Using memcpy to avoid unaligned memory accesses
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+        }
+    } else if (blck_size_interleave == 4) {
+        const uint32_t xor_mask = 0x88888888;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint32_t elems;
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
+        }
+    } else {
+        GGML_ASSERT(false);
     }
 
     return out;
@@ -30,19 +53,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
 // returns an interleaved block_q4_0x8
 // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
 // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
-static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x8 out;
 
     for (int i = 0; i < 8; i++) {
         out.d[i] = in[i].d;
     }
 
-    for (int i = 0; i < QK4_0 * 4; i++) {
-        int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
-        int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
-        src_offset += (i % blck_size_interleave);
+    const int end = QK4_0 * 4 / blck_size_interleave;
+    const uint64_t xor_mask = 0x8888888888888888ULL;
+
+    for (int i = 0; i < end; ++i) {
+        int src_id = i % 8;
+        int src_offset = (i / 8) * blck_size_interleave;
+        int dst_offset = i * blck_size_interleave;
 
-        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+        uint64_t elems;
+        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+        elems ^= xor_mask;
+        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
    }
 
     return out;
@@ -71,11 +100,11 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
         }
 
         if (nrows_interleaved == 8) {
-            *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave, 0x88);
+            *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
             out_ptr = (block_q4_0x8 *) out_ptr + 1;
         }
         else if (nrows_interleaved == 4) {
-            *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave, 0x88);
+            *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
             out_ptr = (block_q4_0x4 *) out_ptr + 1;
         }
     }
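A note on the memcpy idiom above: reading the quants through a cast such as *(const uint64_t *) &in[src_id].qs[src_offset] would be undefined behavior whenever the address is not suitably aligned, whereas a fixed-size memcpy is always well-defined and compilers typically lower it to a single (unaligned) load on targets that support one. The same two helpers are duplicated in ggml-cpu/ggml-cpu-aarch64.c, as the FIXME below notes, so the change is mirrored there.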
ggml/src/ggml-cpu/ggml-cpu-aarch64.c CHANGED
@@ -3387,19 +3387,42 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 }
 
 // FIXME: this code is duplicated from ggml-aarch64.c
-static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;
 
     for (int i = 0; i < 4; i++) {
         out.d[i] = in[i].d;
     }
 
-    for (int i = 0; i < QK4_0 * 2; i++) {
-        int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
-        int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
-        src_offset += (i % blck_size_interleave);
+    const int end = QK4_0 * 2 / blck_size_interleave;
 
-        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    if (blck_size_interleave == 8) {
+        const uint64_t xor_mask = 0x8888888888888888ULL;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint64_t elems;
+            // Using memcpy to avoid unaligned memory accesses
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+        }
+    } else if (blck_size_interleave == 4) {
+        const uint32_t xor_mask = 0x88888888;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint32_t elems;
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
+        }
+    } else {
+        GGML_ASSERT(false);
     }
 
     return out;
@@ -3409,19 +3432,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
 // returns an interleaved block_q4_0x8
 // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
 // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
-static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x8 out;
 
     for (int i = 0; i < 8; i++) {
         out.d[i] = in[i].d;
     }
 
-    for (int i = 0; i < QK4_0 * 4; i++) {
-        int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
-        int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
-        src_offset += (i % blck_size_interleave);
+    const int end = QK4_0 * 4 / blck_size_interleave;
+    const uint64_t xor_mask = 0x8888888888888888ULL;
+
+    for (int i = 0; i < end; ++i) {
+        int src_id = i % 8;
+        int src_offset = (i / 8) * blck_size_interleave;
+        int dst_offset = i * blck_size_interleave;
 
-        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+        uint64_t elems;
+        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+        elems ^= xor_mask;
+        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
     }
 
     return out;
@@ -3449,7 +3478,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
             for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
         }
         src += nrows_interleaved * nblocks;
     }
@@ -3480,7 +3509,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block,
             for (int i = 0; i < nrows_interleaved; i++ ) {
                dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
         }
         src += nrows_interleaved * nblocks;
     }
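Since the rewrite changes the loop structure, it is worth convincing oneself that the chunked indexing (src_id = i % 4, src_offset = (i / 4) * blck_size_interleave, dst_offset = i * blck_size_interleave) touches exactly the bytes the old per-byte loop touched. The standalone check below is a hypothetical harness, not part of the commit; it inlines QK4_0 = 32 and compares the two formulations of the x4 repack for both interleave sizes:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QK4_0 32

int main(void) {
    for (int bsi = 4; bsi <= 8; bsi += 4) {            // blck_size_interleave
        uint8_t qs[4][QK4_0 / 2];                      // quants of 4 q4_0 blocks
        for (int b = 0; b < 4; b++) {
            for (int k = 0; k < QK4_0 / 2; k++) {
                qs[b][k] = (uint8_t)(b * 31 + k * 7);  // arbitrary test pattern
            }
        }

        // old formulation: one byte per iteration
        uint8_t old_dst[QK4_0 * 2];
        for (int i = 0; i < QK4_0 * 2; i++) {
            int src_offset = (i / (4 * bsi)) * bsi + (i % bsi);
            int src_id     = (i % (4 * bsi)) / bsi;
            old_dst[i] = qs[src_id][src_offset] ^ 0x88;
        }

        // new formulation: one bsi-byte chunk per iteration (XOR applied
        // per byte here; the widened-mask XOR is byte-for-byte identical)
        uint8_t new_dst[QK4_0 * 2];
        for (int i = 0; i < QK4_0 * 2 / bsi; i++) {
            memcpy(&new_dst[i * bsi], &qs[i % 4][(i / 4) * bsi], (size_t) bsi);
            for (int k = 0; k < bsi; k++) {
                new_dst[i * bsi + k] ^= 0x88;
            }
        }

        printf("blck_size_interleave=%d: %s\n", bsi,
               memcmp(old_dst, new_dst, sizeof old_dst) == 0 ? "match" : "MISMATCH");
    }
    return 0;
}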