Spaces:
Sleeping
Sleeping
llamafile : support s390x SIMD instruction set (llama/14273)
Browse files
ggml/src/ggml-cpu/llamafile/sgemm.cpp
CHANGED
|
@@ -62,7 +62,7 @@
|
|
| 62 |
#define NOINLINE __attribute__((__noinline__))
|
| 63 |
#endif
|
| 64 |
|
| 65 |
-
#if defined(__ARM_NEON) || defined(__AVX512F__)
|
| 66 |
#define VECTOR_REGISTERS 32
|
| 67 |
#else
|
| 68 |
#define VECTOR_REGISTERS 16
|
|
@@ -109,6 +109,12 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
|
|
| 109 |
// FP16 overload of mul(): forwards both float16x8_t operands to the vmulq_f16 intrinsic.
inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
|
| 110 |
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
| 111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
#if defined(__MMA__)
|
| 113 |
typedef vector unsigned char vec_t;
|
| 114 |
typedef __vector_quad acc_t;
|
|
@@ -162,6 +168,13 @@ inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
|
|
| 162 |
#endif
|
| 163 |
#endif
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
////////////////////////////////////////////////////////////////////////////////////////////////////
|
| 166 |
// VECTORIZED HORIZONTAL SUM
|
| 167 |
|
|
@@ -178,6 +191,13 @@ inline float hsum(float16x8_t x) {
|
|
| 178 |
}
|
| 179 |
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
| 180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
| 182 |
inline float hsum(__m128 x) {
|
| 183 |
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
|
@@ -227,6 +247,21 @@ template <> inline float32x4_t load(const ggml_fp16_t *p) {
|
|
| 227 |
#endif // _MSC_VER
|
| 228 |
#endif // __ARM_NEON
|
| 229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
| 231 |
template <> inline __m128 load(const float *p) {
|
| 232 |
return _mm_loadu_ps(p);
|
|
@@ -3319,6 +3354,14 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
|
| 3319 |
(const float *)B, ldb,
|
| 3320 |
(float *)C, ldc};
|
| 3321 |
return tb.matmul(m, n);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3322 |
#elif defined(__MMA__)
|
| 3323 |
if (k % 8)
|
| 3324 |
return false;
|
|
@@ -3410,6 +3453,16 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
|
| 3410 |
(float *)C, ldc};
|
| 3411 |
return tb.matmul(m, n);
|
| 3412 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3413 |
#endif
|
| 3414 |
return false;
|
| 3415 |
}
|
|
|
|
| 62 |
#define NOINLINE __attribute__((__noinline__))
|
| 63 |
#endif
|
| 64 |
|
| 65 |
+
#if defined(__ARM_NEON) || defined(__AVX512F__) || defined(__VXE__) || defined(__VXE2__)
|
| 66 |
#define VECTOR_REGISTERS 32
|
| 67 |
#else
|
| 68 |
#define VECTOR_REGISTERS 16
|
|
|
|
| 109 |
// FP16 overload of mul(): forwards both float16x8_t operands to the vmulq_f16 intrinsic.
inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
|
| 110 |
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
| 111 |
|
| 112 |
+
#if defined(__VXE__) || defined(__VXE2__)
|
| 113 |
+
// VXE/VXE2 overload of add() for float32x4_t: forwards to the vec_add intrinsic.
inline float32x4_t add(float32x4_t x, float32x4_t y) { return vec_add(x, y); }
|
| 114 |
+
// VXE/VXE2 overload of sub() for float32x4_t: forwards to the vec_sub intrinsic (x - y operand order).
inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vec_sub(x, y); }
|
| 115 |
+
// VXE/VXE2 overload of mul() for float32x4_t: forwards to the vec_mul intrinsic.
inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
|
| 116 |
+
#endif
|
| 117 |
+
|
| 118 |
#if defined(__MMA__)
|
| 119 |
typedef vector unsigned char vec_t;
|
| 120 |
typedef __vector_quad acc_t;
|
|
|
|
| 168 |
#endif
|
| 169 |
#endif
|
| 170 |
|
| 171 |
+
#if defined(__VXE__) || defined(__VXE2__)
|
| 172 |
+
// Explicit specialization of madd() for float32x4_t, compiled only for VXE/VXE2 builds.
// Passes a, b, c straight through to vec_madd in the same order.
// NOTE(review): presumably computes a * b + c like the other madd() overloads — confirm
// against the vec_madd intrinsic documentation.
template <>
|
| 173 |
+
inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
|
| 174 |
+
return vec_madd(a, b, c);
|
| 175 |
+
}
|
| 176 |
+
#endif
|
| 177 |
+
|
| 178 |
////////////////////////////////////////////////////////////////////////////////////////////////////
|
| 179 |
// VECTORIZED HORIZONTAL SUM
|
| 180 |
|
|
|
|
| 191 |
}
|
| 192 |
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
| 193 |
|
| 194 |
+
#if defined(__VXE__) || defined(__VXE2__)
|
| 195 |
+
// Horizontal sum for VXE/VXE2 builds: reduces the four float lanes of x to one scalar.
inline float hsum(float32x4_t x) {
|
| 196 |
+
// Add x to its element-reversed copy: lane 0 becomes x0 + x3 and lane 1 becomes x1 + x2.
float32x4_t tmp = x + vec_reve(x);
|
| 197 |
+
// Lanes 0 and 1 together cover all four inputs exactly once.
return tmp[0] + tmp[1];
|
| 198 |
+
}
|
| 199 |
+
#endif
|
| 200 |
+
|
| 201 |
#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
| 202 |
inline float hsum(__m128 x) {
|
| 203 |
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
|
|
|
| 247 |
#endif // _MSC_VER
|
| 248 |
#endif // __ARM_NEON
|
| 249 |
|
| 250 |
+
#if defined(__VXE__) || defined(__VXE2__)
|
| 251 |
+
// Specialization of load() for VXE/VXE2 builds that reads four ggml_fp16_t values
// starting at p and returns them widened to float in a float32x4_t.
template <> inline float32x4_t load(const ggml_fp16_t * p) {
|
| 252 |
+
// Scratch buffer holding the four converted floats before the vector load.
float tmp[4];
|
| 253 |
+
|
| 254 |
+
// Convert each half-precision element individually with GGML_FP16_TO_FP32.
for (int i = 0; i < 4; i++) {
|
| 255 |
+
tmp[i] = GGML_FP16_TO_FP32(p[i]);
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
// Load the scratch buffer as one vector; the first vec_xl argument is an offset of 0.
return vec_xl(0, (const float *)(tmp));
|
| 259 |
+
}
|
| 260 |
+
// Specialization of load() for VXE/VXE2 builds that reads a float32x4_t directly
// from p using vec_xl with an offset of 0.
template <> inline float32x4_t load(const float * p) {
|
| 261 |
+
return vec_xl(0, p);
|
| 262 |
+
}
|
| 263 |
+
#endif
|
| 264 |
+
|
| 265 |
#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
| 266 |
template <> inline __m128 load(const float *p) {
|
| 267 |
return _mm_loadu_ps(p);
|
|
|
|
| 3354 |
(const float *)B, ldb,
|
| 3355 |
(float *)C, ldc};
|
| 3356 |
return tb.matmul(m, n);
|
| 3357 |
+
#elif defined(__VXE__) || defined(__VXE2__)
|
| 3358 |
+
if (n < 4)
|
| 3359 |
+
return false;
|
| 3360 |
+
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
|
| 3361 |
+
k, (const float *)A, lda,
|
| 3362 |
+
(const float *)B, ldb,
|
| 3363 |
+
(float *)C, ldc};
|
| 3364 |
+
return tb.matmul(m, n);
|
| 3365 |
#elif defined(__MMA__)
|
| 3366 |
if (k % 8)
|
| 3367 |
return false;
|
|
|
|
| 3453 |
(float *)C, ldc};
|
| 3454 |
return tb.matmul(m, n);
|
| 3455 |
}
|
| 3456 |
+
#elif defined(__VXE__) || defined(__VXE2__)
|
| 3457 |
+
if (n < 4)
|
| 3458 |
+
return false;
|
| 3459 |
+
if (Btype == GGML_TYPE_F16) {
|
| 3460 |
+
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
|
| 3461 |
+
k, (const ggml_fp16_t *)A, lda,
|
| 3462 |
+
(const ggml_fp16_t *)B, ldb,
|
| 3463 |
+
(float *)C, ldc};
|
| 3464 |
+
return tb.matmul(m, n);
|
| 3465 |
+
}
|
| 3466 |
#endif
|
| 3467 |
return false;
|
| 3468 |
}
|
ggml/src/ggml-cpu/llamafile/sgemm.h
CHANGED
|
@@ -1,6 +1,11 @@
|
|
| 1 |
#pragma once
|
| 2 |
#include <stdint.h>
|
| 3 |
#include <stdbool.h>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
#ifdef __cplusplus
|
| 5 |
extern "C" {
|
| 6 |
#endif
|
|
|
|
| 1 |
#pragma once
|
| 2 |
#include <stdint.h>
|
| 3 |
#include <stdbool.h>
|
| 4 |
+
|
| 5 |
+
#if defined(__VXE__) || defined(__VXE2__)
|
| 6 |
+
#include <vecintrin.h>
|
| 7 |
+
#endif
|
| 8 |
+
|
| 9 |
#ifdef __cplusplus
|
| 10 |
extern "C" {
|
| 11 |
#endif
|