Spaces:
Sleeping
Sleeping
android : use "ci-android" branch for CI (llama/7341)
Browse files
* android : use "ci-android" branch for CI
* ggml : disable SIMD exp and silu for 32-bit ARM
ggml-ci
* android : do not fetch, use add_subdirectory instead
* cmake : provide binary dir
ggml.c
CHANGED
|
@@ -2076,7 +2076,7 @@ inline static float ggml_silu_f32(float x) {
|
|
| 2076 |
return x/(1.0f + expf(-x));
|
| 2077 |
}
|
| 2078 |
|
| 2079 |
-
#if defined(__ARM_NEON)
|
| 2080 |
|
| 2081 |
// adapted from arm limited optimized routine
|
| 2082 |
// the maximum error is 1.45358 plus 0.5 ulps
|
|
@@ -2288,7 +2288,7 @@ static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
|
|
| 2288 |
for (; i + 3 < n; i += 4) {
|
| 2289 |
_mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
|
| 2290 |
}
|
| 2291 |
-
#elif defined(__ARM_NEON)
|
| 2292 |
for (; i + 3 < n; i += 4) {
|
| 2293 |
vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
|
| 2294 |
}
|
|
@@ -2335,7 +2335,7 @@ static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x,
|
|
| 2335 |
#endif
|
| 2336 |
sum += (ggml_float)_mm_cvtss_f32(val);
|
| 2337 |
}
|
| 2338 |
-
#elif defined(__ARM_NEON)
|
| 2339 |
for (; i + 3 < n; i += 4) {
|
| 2340 |
float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
|
| 2341 |
vdupq_n_f32(max)));
|
|
|
|
| 2076 |
return x/(1.0f + expf(-x));
|
| 2077 |
}
|
| 2078 |
|
| 2079 |
+
#if defined(__ARM_NEON) && defined(__aarch64__)
|
| 2080 |
|
| 2081 |
// adapted from arm limited optimized routine
|
| 2082 |
// the maximum error is 1.45358 plus 0.5 ulps
|
|
|
|
| 2288 |
for (; i + 3 < n; i += 4) {
|
| 2289 |
_mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
|
| 2290 |
}
|
| 2291 |
+
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
| 2292 |
for (; i + 3 < n; i += 4) {
|
| 2293 |
vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
|
| 2294 |
}
|
|
|
|
| 2335 |
#endif
|
| 2336 |
sum += (ggml_float)_mm_cvtss_f32(val);
|
| 2337 |
}
|
| 2338 |
+
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
| 2339 |
for (; i + 3 < n; i += 4) {
|
| 2340 |
float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
|
| 2341 |
vdupq_n_f32(max)));
|