Spaces:
Running
Running
ggml : make use of ggml-quants.h possible in C++ code (llama/5338)
Browse files
* Make use of ggml-quants.h possible in C++ code
* One cannot possibly be defining static_assert in a C++ compilation
---------
Co-authored-by: Iwan Kawrakow <[email protected]>
- ggml-impl.h +2 -0
- ggml-quants.h +68 -59
ggml-impl.h
CHANGED
|
@@ -19,6 +19,7 @@ extern "C" {
|
|
| 19 |
// fall back to the _Static_assert C11 keyword.
|
| 20 |
// if C99 - static_assert is noop
|
| 21 |
// ref: https://stackoverflow.com/a/53923785/4039976
|
|
|
|
| 22 |
#ifndef static_assert
|
| 23 |
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
| 24 |
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
|
@@ -26,6 +27,7 @@ extern "C" {
|
|
| 26 |
#define static_assert(cond, msg) struct global_scope_noop_trick
|
| 27 |
#endif
|
| 28 |
#endif
|
|
|
|
| 29 |
|
| 30 |
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
| 31 |
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
|
|
|
| 19 |
// fall back to the _Static_assert C11 keyword.
|
| 20 |
// if C99 - static_assert is noop
|
| 21 |
// ref: https://stackoverflow.com/a/53923785/4039976
|
| 22 |
+
#ifndef __cplusplus
|
| 23 |
#ifndef static_assert
|
| 24 |
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
| 25 |
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
|
|
|
| 27 |
#define static_assert(cond, msg) struct global_scope_noop_trick
|
| 28 |
#endif
|
| 29 |
#endif
|
| 30 |
+
#endif
|
| 31 |
|
| 32 |
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
| 33 |
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
ggml-quants.h
CHANGED
|
@@ -191,70 +191,74 @@ typedef struct {
|
|
| 191 |
} block_iq3_xxs;
|
| 192 |
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
// Quantization
|
| 195 |
-
void quantize_row_q4_0_reference(const float *
|
| 196 |
-
void quantize_row_q4_1_reference(const float *
|
| 197 |
-
void quantize_row_q5_0_reference(const float *
|
| 198 |
-
void quantize_row_q5_1_reference(const float *
|
| 199 |
-
void quantize_row_q8_0_reference(const float *
|
| 200 |
-
void quantize_row_q8_1_reference(const float *
|
| 201 |
-
|
| 202 |
-
void quantize_row_q2_K_reference(const float *
|
| 203 |
-
void quantize_row_q3_K_reference(const float *
|
| 204 |
-
void quantize_row_q4_K_reference(const float *
|
| 205 |
-
void quantize_row_q5_K_reference(const float *
|
| 206 |
-
void quantize_row_q6_K_reference(const float *
|
| 207 |
-
void quantize_row_q8_K_reference(const float *
|
| 208 |
-
void quantize_row_iq3_xxs_reference(const float *
|
| 209 |
-
|
| 210 |
-
void quantize_row_q4_0(const float *
|
| 211 |
-
void quantize_row_q4_1(const float *
|
| 212 |
-
void quantize_row_q5_0(const float *
|
| 213 |
-
void quantize_row_q5_1(const float *
|
| 214 |
-
void quantize_row_q8_0(const float *
|
| 215 |
-
void quantize_row_q8_1(const float *
|
| 216 |
-
|
| 217 |
-
void quantize_row_q2_K(const float *
|
| 218 |
-
void quantize_row_q3_K(const float *
|
| 219 |
-
void quantize_row_q4_K(const float *
|
| 220 |
-
void quantize_row_q5_K(const float *
|
| 221 |
-
void quantize_row_q6_K(const float *
|
| 222 |
-
void quantize_row_q8_K(const float *
|
| 223 |
-
void quantize_row_iq3_xxs(const float *
|
| 224 |
|
| 225 |
// Dequantization
|
| 226 |
-
void dequantize_row_q4_0(const block_q4_0 *
|
| 227 |
-
void dequantize_row_q4_1(const block_q4_1 *
|
| 228 |
-
void dequantize_row_q5_0(const block_q5_0 *
|
| 229 |
-
void dequantize_row_q5_1(const block_q5_1 *
|
| 230 |
-
void dequantize_row_q8_0(const block_q8_0 *
|
| 231 |
-
//void dequantize_row_q8_1(const block_q8_1 *
|
| 232 |
-
|
| 233 |
-
void dequantize_row_q2_K(const block_q2_K *
|
| 234 |
-
void dequantize_row_q3_K(const block_q3_K *
|
| 235 |
-
void dequantize_row_q4_K(const block_q4_K *
|
| 236 |
-
void dequantize_row_q5_K(const block_q5_K *
|
| 237 |
-
void dequantize_row_q6_K(const block_q6_K *
|
| 238 |
-
void dequantize_row_q8_K(const block_q8_K *
|
| 239 |
-
void dequantize_row_iq2_xxs(const block_iq2_xxs *
|
| 240 |
-
void dequantize_row_iq2_xs (const block_iq2_xs *
|
| 241 |
-
void dequantize_row_iq3_xxs(const block_iq3_xxs *
|
| 242 |
|
| 243 |
// Dot product
|
| 244 |
-
void ggml_vec_dot_q4_0_q8_0(int n, float *
|
| 245 |
-
void ggml_vec_dot_q4_1_q8_1(int n, float *
|
| 246 |
-
void ggml_vec_dot_q5_0_q8_0(int n, float *
|
| 247 |
-
void ggml_vec_dot_q5_1_q8_1(int n, float *
|
| 248 |
-
void ggml_vec_dot_q8_0_q8_0(int n, float *
|
| 249 |
-
|
| 250 |
-
void ggml_vec_dot_q2_K_q8_K(int n, float *
|
| 251 |
-
void ggml_vec_dot_q3_K_q8_K(int n, float *
|
| 252 |
-
void ggml_vec_dot_q4_K_q8_K(int n, float *
|
| 253 |
-
void ggml_vec_dot_q5_K_q8_K(int n, float *
|
| 254 |
-
void ggml_vec_dot_q6_K_q8_K(int n, float *
|
| 255 |
-
void ggml_vec_dot_iq2_xxs_q8_K(int n, float *
|
| 256 |
-
void ggml_vec_dot_iq2_xs_q8_K (int n, float *
|
| 257 |
-
void ggml_vec_dot_iq3_xxs_q8_K(int n, float *
|
| 258 |
|
| 259 |
//
|
| 260 |
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
|
@@ -276,3 +280,8 @@ void iq2xs_init_impl(int grid_size);
|
|
| 276 |
void iq2xs_free_impl(int grid_size);
|
| 277 |
void iq3xs_init_impl(int grid_size);
|
| 278 |
void iq3xs_free_impl(int grid_size);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
} block_iq3_xxs;
|
| 192 |
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
| 193 |
|
| 194 |
+
#ifdef __cplusplus
|
| 195 |
+
extern "C" {
|
| 196 |
+
#endif
|
| 197 |
+
|
| 198 |
// Quantization
|
| 199 |
+
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
|
| 200 |
+
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
|
| 201 |
+
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
|
| 202 |
+
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
|
| 203 |
+
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
|
| 204 |
+
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
|
| 205 |
+
|
| 206 |
+
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
|
| 207 |
+
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
|
| 208 |
+
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
|
| 209 |
+
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
|
| 210 |
+
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
| 211 |
+
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
| 212 |
+
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
| 213 |
+
|
| 214 |
+
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 215 |
+
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 216 |
+
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 217 |
+
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 218 |
+
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 219 |
+
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 220 |
+
|
| 221 |
+
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 222 |
+
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 223 |
+
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 224 |
+
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 225 |
+
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 226 |
+
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 227 |
+
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 228 |
|
| 229 |
// Dequantization
|
| 230 |
+
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 231 |
+
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 232 |
+
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 233 |
+
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 234 |
+
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 235 |
+
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 236 |
+
|
| 237 |
+
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 238 |
+
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 239 |
+
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 240 |
+
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 241 |
+
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 242 |
+
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 243 |
+
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 244 |
+
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 245 |
+
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 246 |
|
| 247 |
// Dot product
|
| 248 |
+
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
| 249 |
+
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
| 250 |
+
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
| 251 |
+
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
| 252 |
+
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
| 253 |
+
|
| 254 |
+
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
| 255 |
+
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
| 256 |
+
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
| 257 |
+
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
| 258 |
+
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
| 259 |
+
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
| 260 |
+
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
| 261 |
+
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
| 262 |
|
| 263 |
//
|
| 264 |
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
|
|
|
| 280 |
void iq2xs_free_impl(int grid_size);
|
| 281 |
void iq3xs_init_impl(int grid_size);
|
| 282 |
void iq3xs_free_impl(int grid_size);
|
| 283 |
+
|
| 284 |
+
#ifdef __cplusplus
|
| 285 |
+
}
|
| 286 |
+
#endif
|
| 287 |
+
|