Crad commited on
Commit
430efc6
·
unverified ·
1 Parent(s): 10ac4bb

ggml-quants : provide ggml_vqtbl1q_u8 for 64bit compatibility (llama/5711)

Browse files

* [ggml-quants] Provide ggml_vqtbl1q_u8 for 64bit compatibility

vqtbl1q_u8 is not part of arm v7 neon library

* [android-example] Remove abi filter after arm v7a fix

* [github-workflows] Do not skip Android armeabi-v7a build

Files changed (1) hide show
  1. ggml-quants.c +29 -4
ggml-quants.c CHANGED
@@ -462,6 +462,30 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
462
  return res;
463
  }
464
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
465
  #else
466
 
467
  #define ggml_int16x8x2_t int16x8x2_t
@@ -476,6 +500,7 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
476
  #define ggml_vld1q_s8_x2 vld1q_s8_x2
477
  #define ggml_vld1q_s8_x4 vld1q_s8_x4
478
  #define ggml_vqtbl1q_s8 vqtbl1q_s8
 
479
 
480
  #endif
481
 
@@ -9488,8 +9513,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
9488
  qs += 16;
9489
 
9490
  vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
9491
- vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9492
- vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9493
  vs.val[0] = vceqq_u8(vs.val[0], mask2);
9494
  vs.val[1] = vceqq_u8(vs.val[1], mask2);
9495
 
@@ -9497,8 +9522,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
9497
  q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));
9498
 
9499
  vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
9500
- vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9501
- vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9502
  vs.val[0] = vceqq_u8(vs.val[0], mask2);
9503
  vs.val[1] = vceqq_u8(vs.val[1], mask2);
9504
 
 
462
  return res;
463
  }
464
 
465
+ // NOTE: not tested
466
+ inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
467
+ int8x16_t res;
468
+
469
+ res[ 0] = a[b[ 0]];
470
+ res[ 1] = a[b[ 1]];
471
+ res[ 2] = a[b[ 2]];
472
+ res[ 3] = a[b[ 3]];
473
+ res[ 4] = a[b[ 4]];
474
+ res[ 5] = a[b[ 5]];
475
+ res[ 6] = a[b[ 6]];
476
+ res[ 7] = a[b[ 7]];
477
+ res[ 8] = a[b[ 8]];
478
+ res[ 9] = a[b[ 9]];
479
+ res[10] = a[b[10]];
480
+ res[11] = a[b[11]];
481
+ res[12] = a[b[12]];
482
+ res[13] = a[b[13]];
483
+ res[14] = a[b[14]];
484
+ res[15] = a[b[15]];
485
+
486
+ return res;
487
+ }
488
+
489
  #else
490
 
491
  #define ggml_int16x8x2_t int16x8x2_t
 
500
  #define ggml_vld1q_s8_x2 vld1q_s8_x2
501
  #define ggml_vld1q_s8_x4 vld1q_s8_x4
502
  #define ggml_vqtbl1q_s8 vqtbl1q_s8
503
+ #define ggml_vqtbl1q_u8 vqtbl1q_u8
504
 
505
  #endif
506
 
 
9513
  qs += 16;
9514
 
9515
  vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
9516
+ vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9517
+ vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9518
  vs.val[0] = vceqq_u8(vs.val[0], mask2);
9519
  vs.val[1] = vceqq_u8(vs.val[1], mask2);
9520
 
 
9522
  q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));
9523
 
9524
  vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
9525
+ vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9526
+ vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9527
  vs.val[0] = vceqq_u8(vs.val[0], mask2);
9528
  vs.val[1] = vceqq_u8(vs.val[1], mask2);
9529