Spaces:
Running
Running
jdomke
domke
commited on
Commit
·
c26339f
1
Parent(s):
8b10f59
ggml : reading the runtime sve config of the cpu (llama/8709)
Browse files* ggml : reading the runtime sve config of the cpu
* change to one time init to prevent performance drop
* prefix variable to avoid possible conflicts
* revert xxhash fix and add brackets
---------
Co-authored-by: domke <[email protected]>
- ggml/src/ggml-aarch64.c +14 -14
- ggml/src/ggml-impl.h +1 -0
- ggml/src/ggml-quants.c +2 -2
- ggml/src/ggml-quants.h +4 -0
- ggml/src/ggml.c +9 -0
ggml/src/ggml-aarch64.c
CHANGED
|
@@ -386,8 +386,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
| 386 |
UNUSED(blocklen);
|
| 387 |
|
| 388 |
#if defined(__ARM_FEATURE_SVE)
|
| 389 |
-
if (
|
| 390 |
-
GGML_ASSERT(!(ggml_cpu_has_sve() && (
|
| 391 |
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
| 392 |
}
|
| 393 |
#endif
|
|
@@ -498,8 +498,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
| 498 |
UNUSED(blocklen);
|
| 499 |
|
| 500 |
#if defined(__ARM_FEATURE_SVE)
|
| 501 |
-
if (
|
| 502 |
-
GGML_ASSERT(!(ggml_cpu_has_sve() && (
|
| 503 |
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
| 504 |
}
|
| 505 |
#endif
|
|
@@ -616,7 +616,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
| 616 |
UNUSED(blocklen);
|
| 617 |
|
| 618 |
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
|
| 619 |
-
if (
|
| 620 |
const void * b_ptr = vx;
|
| 621 |
const void * a_ptr = vy;
|
| 622 |
float * res_ptr = s;
|
|
@@ -682,12 +682,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
| 682 |
return;
|
| 683 |
}
|
| 684 |
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
| 685 |
-
GGML_ASSERT((ggml_cpu_has_sve() && (
|
| 686 |
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
|
| 687 |
"performance");
|
| 688 |
}
|
| 689 |
else if (ggml_cpu_has_neon()) {
|
| 690 |
-
GGML_ASSERT(((ggml_cpu_has_sve() && (
|
| 691 |
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
|
| 692 |
"quantization format for optimal performance");
|
| 693 |
}
|
|
@@ -747,8 +747,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
| 747 |
UNUSED(blocklen);
|
| 748 |
|
| 749 |
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
| 750 |
-
if (
|
| 751 |
-
GGML_ASSERT(!(ggml_cpu_has_sve() && (
|
| 752 |
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
| 753 |
}
|
| 754 |
#endif
|
|
@@ -1268,8 +1268,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
| 1268 |
UNUSED(blocklen);
|
| 1269 |
|
| 1270 |
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
| 1271 |
-
if (
|
| 1272 |
-
GGML_ASSERT(!(ggml_cpu_has_sve() && (
|
| 1273 |
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
| 1274 |
}
|
| 1275 |
#endif
|
|
@@ -1730,7 +1730,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
| 1730 |
UNUSED(blocklen);
|
| 1731 |
|
| 1732 |
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
|
| 1733 |
-
if (
|
| 1734 |
const void * b_ptr = vx;
|
| 1735 |
const void * a_ptr = vy;
|
| 1736 |
float * res_ptr = s;
|
|
@@ -2141,12 +2141,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
| 2141 |
return;
|
| 2142 |
}
|
| 2143 |
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
| 2144 |
-
GGML_ASSERT((ggml_cpu_has_sve() && (
|
| 2145 |
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
|
| 2146 |
"performance");
|
| 2147 |
}
|
| 2148 |
else if (ggml_cpu_has_neon()) {
|
| 2149 |
-
GGML_ASSERT(((ggml_cpu_has_sve() && (
|
| 2150 |
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
|
| 2151 |
"quantization format for optimal performance");
|
| 2152 |
}
|
|
|
|
| 386 |
UNUSED(blocklen);
|
| 387 |
|
| 388 |
#if defined(__ARM_FEATURE_SVE)
|
| 389 |
+
if (ggml_sve_cnt_b == QK8_0) {
|
| 390 |
+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
|
| 391 |
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
| 392 |
}
|
| 393 |
#endif
|
|
|
|
| 498 |
UNUSED(blocklen);
|
| 499 |
|
| 500 |
#if defined(__ARM_FEATURE_SVE)
|
| 501 |
+
if (ggml_sve_cnt_b == QK8_0) {
|
| 502 |
+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
|
| 503 |
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
| 504 |
}
|
| 505 |
#endif
|
|
|
|
| 616 |
UNUSED(blocklen);
|
| 617 |
|
| 618 |
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
|
| 619 |
+
if (ggml_sve_cnt_b == QK8_0) {
|
| 620 |
const void * b_ptr = vx;
|
| 621 |
const void * a_ptr = vy;
|
| 622 |
float * res_ptr = s;
|
|
|
|
| 682 |
return;
|
| 683 |
}
|
| 684 |
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
| 685 |
+
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
|
| 686 |
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
|
| 687 |
"performance");
|
| 688 |
}
|
| 689 |
else if (ggml_cpu_has_neon()) {
|
| 690 |
+
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
|
| 691 |
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
|
| 692 |
"quantization format for optimal performance");
|
| 693 |
}
|
|
|
|
| 747 |
UNUSED(blocklen);
|
| 748 |
|
| 749 |
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
| 750 |
+
if (ggml_sve_cnt_b == QK8_0) {
|
| 751 |
+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
|
| 752 |
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
| 753 |
}
|
| 754 |
#endif
|
|
|
|
| 1268 |
UNUSED(blocklen);
|
| 1269 |
|
| 1270 |
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
| 1271 |
+
if (ggml_sve_cnt_b == QK8_0) {
|
| 1272 |
+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
|
| 1273 |
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
| 1274 |
}
|
| 1275 |
#endif
|
|
|
|
| 1730 |
UNUSED(blocklen);
|
| 1731 |
|
| 1732 |
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
|
| 1733 |
+
if (ggml_sve_cnt_b == QK8_0) {
|
| 1734 |
const void * b_ptr = vx;
|
| 1735 |
const void * a_ptr = vy;
|
| 1736 |
float * res_ptr = s;
|
|
|
|
| 2141 |
return;
|
| 2142 |
}
|
| 2143 |
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
| 2144 |
+
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
|
| 2145 |
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
|
| 2146 |
"performance");
|
| 2147 |
}
|
| 2148 |
else if (ggml_cpu_has_neon()) {
|
| 2149 |
+
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
|
| 2150 |
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
|
| 2151 |
"quantization format for optimal performance");
|
| 2152 |
}
|
ggml/src/ggml-impl.h
CHANGED
|
@@ -143,6 +143,7 @@ extern "C" {
|
|
| 143 |
|
| 144 |
#if defined(__ARM_FEATURE_SVE)
|
| 145 |
#include <arm_sve.h>
|
|
|
|
| 146 |
#endif
|
| 147 |
|
| 148 |
// 16-bit float
|
|
|
|
| 143 |
|
| 144 |
#if defined(__ARM_FEATURE_SVE)
|
| 145 |
#include <arm_sve.h>
|
| 146 |
+
#include <sys/prctl.h>
|
| 147 |
#endif
|
| 148 |
|
| 149 |
// 16-bit float
|
ggml/src/ggml-quants.c
CHANGED
|
@@ -3818,7 +3818,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
| 3818 |
float sumf = 0;
|
| 3819 |
|
| 3820 |
#if defined(__ARM_FEATURE_SVE)
|
| 3821 |
-
if (
|
| 3822 |
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
|
| 3823 |
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
|
| 3824 |
|
|
@@ -5303,7 +5303,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
| 5303 |
float sumf = 0;
|
| 5304 |
|
| 5305 |
#if defined(__ARM_FEATURE_SVE)
|
| 5306 |
-
if (
|
| 5307 |
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
| 5308 |
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
| 5309 |
|
|
|
|
| 3818 |
float sumf = 0;
|
| 3819 |
|
| 3820 |
#if defined(__ARM_FEATURE_SVE)
|
| 3821 |
+
if (ggml_sve_cnt_b == QK8_0) {
|
| 3822 |
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
|
| 3823 |
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
|
| 3824 |
|
|
|
|
| 5303 |
float sumf = 0;
|
| 5304 |
|
| 5305 |
#if defined(__ARM_FEATURE_SVE)
|
| 5306 |
+
if (ggml_sve_cnt_b == QK8_0) {
|
| 5307 |
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
| 5308 |
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
| 5309 |
|
ggml/src/ggml-quants.h
CHANGED
|
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum ggml_type type);
|
|
| 127 |
void iq3xs_init_impl(int grid_size);
|
| 128 |
void iq3xs_free_impl(int grid_size);
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
#ifdef __cplusplus
|
| 131 |
}
|
| 132 |
#endif
|
|
|
|
| 127 |
void iq3xs_init_impl(int grid_size);
|
| 128 |
void iq3xs_free_impl(int grid_size);
|
| 129 |
|
| 130 |
+
#if defined(__ARM_FEATURE_SVE)
|
| 131 |
+
extern int ggml_sve_cnt_b;
|
| 132 |
+
#endif
|
| 133 |
+
|
| 134 |
#ifdef __cplusplus
|
| 135 |
}
|
| 136 |
#endif
|
ggml/src/ggml.c
CHANGED
|
@@ -37,6 +37,9 @@
|
|
| 37 |
#include <unistd.h>
|
| 38 |
#endif
|
| 39 |
|
|
|
|
|
|
|
|
|
|
| 40 |
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
|
| 41 |
#undef GGML_USE_LLAMAFILE
|
| 42 |
#endif
|
|
@@ -3561,6 +3564,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
| 3561 |
|
| 3562 |
GGML_ASSERT_ALIGNED(ctx->mem_buffer);
|
| 3563 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3564 |
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
|
| 3565 |
|
| 3566 |
ggml_critical_section_end();
|
|
|
|
| 37 |
#include <unistd.h>
|
| 38 |
#endif
|
| 39 |
|
| 40 |
+
#if defined(__ARM_FEATURE_SVE)
|
| 41 |
+
int ggml_sve_cnt_b = 0;
|
| 42 |
+
#endif
|
| 43 |
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
|
| 44 |
#undef GGML_USE_LLAMAFILE
|
| 45 |
#endif
|
|
|
|
| 3564 |
|
| 3565 |
GGML_ASSERT_ALIGNED(ctx->mem_buffer);
|
| 3566 |
|
| 3567 |
+
#if defined(__ARM_FEATURE_SVE)
|
| 3568 |
+
if (!ggml_sve_cnt_b) {
|
| 3569 |
+
ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
|
| 3570 |
+
}
|
| 3571 |
+
#endif
|
| 3572 |
+
|
| 3573 |
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
|
| 3574 |
|
| 3575 |
ggml_critical_section_end();
|