jdomke domke committed on
Commit
c26339f
·
1 Parent(s): 8b10f59

ggml : reading the runtime sve config of the cpu (llama/8709)

Browse files

* ggml : reading the runtime sve config of the cpu

* change to one time init to prevent performance drop

* prefix variable to avoid possible conflicts

* revert xxhash fix and add brackets

---------

Co-authored-by: domke <[email protected]>

ggml/src/ggml-aarch64.c CHANGED
@@ -386,8 +386,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
386
  UNUSED(blocklen);
387
 
388
  #if defined(__ARM_FEATURE_SVE)
389
- if (svcntw() == 8) {
390
- GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
391
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
392
  }
393
  #endif
@@ -498,8 +498,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
498
  UNUSED(blocklen);
499
 
500
  #if defined(__ARM_FEATURE_SVE)
501
- if (svcntw() == 8) {
502
- GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
503
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
504
  }
505
  #endif
@@ -616,7 +616,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
616
  UNUSED(blocklen);
617
 
618
  #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
619
- if (svcntw() == 8) {
620
  const void * b_ptr = vx;
621
  const void * a_ptr = vy;
622
  float * res_ptr = s;
@@ -682,12 +682,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
682
  return;
683
  }
684
  else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
685
- GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
686
  "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
687
  "performance");
688
  }
689
  else if (ggml_cpu_has_neon()) {
690
- GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
691
  "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
692
  "quantization format for optimal performance");
693
  }
@@ -747,8 +747,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
747
  UNUSED(blocklen);
748
 
749
  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
750
- if (svcntw() == 8) {
751
- GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
752
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
753
  }
754
  #endif
@@ -1268,8 +1268,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
1268
  UNUSED(blocklen);
1269
 
1270
  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1271
- if (svcntw() == 8) {
1272
- GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
1273
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
1274
  }
1275
  #endif
@@ -1730,7 +1730,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
1730
  UNUSED(blocklen);
1731
 
1732
  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
1733
- if (svcntw() == 8) {
1734
  const void * b_ptr = vx;
1735
  const void * a_ptr = vy;
1736
  float * res_ptr = s;
@@ -2141,12 +2141,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
2141
  return;
2142
  }
2143
  else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
2144
- GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
2145
  "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
2146
  "performance");
2147
  }
2148
  else if (ggml_cpu_has_neon()) {
2149
- GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
2150
  "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
2151
  "quantization format for optimal performance");
2152
  }
 
386
  UNUSED(blocklen);
387
 
388
  #if defined(__ARM_FEATURE_SVE)
389
+ if (ggml_sve_cnt_b == QK8_0) {
390
+ GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
391
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
392
  }
393
  #endif
 
498
  UNUSED(blocklen);
499
 
500
  #if defined(__ARM_FEATURE_SVE)
501
+ if (ggml_sve_cnt_b == QK8_0) {
502
+ GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
503
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
504
  }
505
  #endif
 
616
  UNUSED(blocklen);
617
 
618
  #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
619
+ if (ggml_sve_cnt_b == QK8_0) {
620
  const void * b_ptr = vx;
621
  const void * a_ptr = vy;
622
  float * res_ptr = s;
 
682
  return;
683
  }
684
  else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
685
+ GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
686
  "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
687
  "performance");
688
  }
689
  else if (ggml_cpu_has_neon()) {
690
+ GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
691
  "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
692
  "quantization format for optimal performance");
693
  }
 
747
  UNUSED(blocklen);
748
 
749
  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
750
+ if (ggml_sve_cnt_b == QK8_0) {
751
+ GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
752
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
753
  }
754
  #endif
 
1268
  UNUSED(blocklen);
1269
 
1270
  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1271
+ if (ggml_sve_cnt_b == QK8_0) {
1272
+ GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
1273
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
1274
  }
1275
  #endif
 
1730
  UNUSED(blocklen);
1731
 
1732
  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
1733
+ if (ggml_sve_cnt_b == QK8_0) {
1734
  const void * b_ptr = vx;
1735
  const void * a_ptr = vy;
1736
  float * res_ptr = s;
 
2141
  return;
2142
  }
2143
  else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
2144
+ GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
2145
  "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
2146
  "performance");
2147
  }
2148
  else if (ggml_cpu_has_neon()) {
2149
+ GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
2150
  "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
2151
  "quantization format for optimal performance");
2152
  }
ggml/src/ggml-impl.h CHANGED
@@ -143,6 +143,7 @@ extern "C" {
143
 
144
  #if defined(__ARM_FEATURE_SVE)
145
  #include <arm_sve.h>
 
146
  #endif
147
 
148
  // 16-bit float
 
143
 
144
  #if defined(__ARM_FEATURE_SVE)
145
  #include <arm_sve.h>
146
+ #include <sys/prctl.h>
147
  #endif
148
 
149
  // 16-bit float
ggml/src/ggml-quants.c CHANGED
@@ -3818,7 +3818,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3818
  float sumf = 0;
3819
 
3820
  #if defined(__ARM_FEATURE_SVE)
3821
- if (svcntb() == QK8_0) {
3822
  const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
3823
  const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
3824
 
@@ -5303,7 +5303,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
5303
  float sumf = 0;
5304
 
5305
  #if defined(__ARM_FEATURE_SVE)
5306
- if (svcntb() == QK8_0) {
5307
  svfloat32_t sumv0 = svdup_n_f32(0.0f);
5308
  svfloat32_t sumv1 = svdup_n_f32(0.0f);
5309
 
 
3818
  float sumf = 0;
3819
 
3820
  #if defined(__ARM_FEATURE_SVE)
3821
+ if (ggml_sve_cnt_b == QK8_0) {
3822
  const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
3823
  const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
3824
 
 
5303
  float sumf = 0;
5304
 
5305
  #if defined(__ARM_FEATURE_SVE)
5306
+ if (ggml_sve_cnt_b == QK8_0) {
5307
  svfloat32_t sumv0 = svdup_n_f32(0.0f);
5308
  svfloat32_t sumv1 = svdup_n_f32(0.0f);
5309
 
ggml/src/ggml-quants.h CHANGED
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum ggml_type type);
127
  void iq3xs_init_impl(int grid_size);
128
  void iq3xs_free_impl(int grid_size);
129
 
 
 
 
 
130
  #ifdef __cplusplus
131
  }
132
  #endif
 
127
  void iq3xs_init_impl(int grid_size);
128
  void iq3xs_free_impl(int grid_size);
129
 
130
+ #if defined(__ARM_FEATURE_SVE)
131
+ extern int ggml_sve_cnt_b;
132
+ #endif
133
+
134
  #ifdef __cplusplus
135
  }
136
  #endif
ggml/src/ggml.c CHANGED
@@ -37,6 +37,9 @@
37
  #include <unistd.h>
38
  #endif
39
 
 
 
 
40
  #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
41
  #undef GGML_USE_LLAMAFILE
42
  #endif
@@ -3561,6 +3564,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
3561
 
3562
  GGML_ASSERT_ALIGNED(ctx->mem_buffer);
3563
 
 
 
 
 
 
 
3564
  GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
3565
 
3566
  ggml_critical_section_end();
 
37
  #include <unistd.h>
38
  #endif
39
 
40
+ #if defined(__ARM_FEATURE_SVE)
41
+ int ggml_sve_cnt_b = 0;
42
+ #endif
43
  #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
44
  #undef GGML_USE_LLAMAFILE
45
  #endif
 
3564
 
3565
  GGML_ASSERT_ALIGNED(ctx->mem_buffer);
3566
 
3567
+ #if defined(__ARM_FEATURE_SVE)
3568
+ if (!ggml_sve_cnt_b) {
3569
+ ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
3570
+ }
3571
+ #endif
3572
+
3573
  GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
3574
 
3575
  ggml_critical_section_end();