ggml: bugfix: fix inactive elements being agnostic for RISC-V vector (llama/8748)
In this code we want the destination elements to retain the values they previously held when mask[i] is false, so we should use the undisturbed policy. Under the default agnostic policy of the RVV intrinsics, those values may either be preserved or be overwritten with all 1s.
Co-authored-by: carter.li <[email protected]>
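For context, a minimal scalar sketch (plain C, not from the commit; masked_sub4_undisturbed, dst, src, and mask are hypothetical names) of what the undisturbed policy guarantees for a masked operation like the q3_K subtract:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Scalar model of a masked "subtract 4" over vl elements.
// Mask-undisturbed (_mu): inactive elements keep their previous value.
// Mask-agnostic (the default policy): inactive elements may keep their
// value OR be filled with all 1s -- the hardware is free to choose, so
// code that later reads those elements is buggy under that policy.
static void masked_sub4_undisturbed(int8_t *dst, const int8_t *src,
                                    const bool *mask, size_t vl) {
    for (size_t i = 0; i < vl; ++i) {
        if (mask[i]) {
            dst[i] = src[i] - 0x4;  // active element: written
        }                           // inactive element: dst[i] untouched
    }
}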
ggml/src/ggml-quants.c (+6 -6)
@@ -6449,22 +6449,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             // compute mask for subtraction
             vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
-            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
             m <<= 1;

             vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
             m <<= 1;

             vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
             m <<= 1;

             vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
-            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
             m <<= 1;

             // load Q8 and take product with Q3

@@ -7720,13 +7720,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
             vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
             m <<= 1;

             vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
             vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
             m <<= 1;

             vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
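For completeness, a minimal standalone check (a sketch, not part of the commit) that the _mu variant used in the fix really preserves inactive elements; it assumes a toolchain providing the RVV intrinsics in riscv_vector.h and a target (or QEMU) with the V extension:

#include <riscv_vector.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    int8_t  src[8] = {8, 8, 8, 8, 8, 8, 8, 8};
    uint8_t qh[8]  = {1, 0, 1, 0, 1, 0, 1, 0};
    int8_t  out[8];

    size_t vl = __riscv_vsetvl_e8m1(8);
    vint8m1_t  v   = __riscv_vle8_v_i8m1(src, vl);
    vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);

    // mirror the q3_K code path: mask is set where (qh & m) == 0
    vbool8_t mask = __riscv_vmseq_vx_u8m1_b8(__riscv_vand_vx_u8m1(vqh, 1, vl), 0, vl);

    // _mu: active lanes compute v - 4; inactive lanes keep the maskedoff
    // operand (here v itself), i.e. they are "undisturbed"
    vint8m1_t r = __riscv_vsub_vx_i8m1_mu(mask, v, v, 0x4, vl);

    __riscv_vse8_v_i8m1(out, r, vl);
    for (size_t i = 0; i < vl; ++i) {
        printf("%d ", out[i]);  // expected: 8 4 8 4 8 4 8 4
    }
    printf("\n");
    return 0;
}

Under the old agnostic-policy call, the inactive lanes here could legally come back as -1 (all 1s) instead of 8, which is exactly the nondeterminism the commit removes.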