Kawrakow ikawrakow committed on
Commit
963ade6
·
unverified ·
1 Parent(s): 9bb2b0a

ggml : make use of ggml-quants.h possible in C++ code (llama/5338)

Browse files

* Make use of ggml-quants.h possible in C++ code

* One cannot possibly be defining static_assert in a C++ compilation

---------

Co-authored-by: Iwan Kawrakow <[email protected]>

Files changed (2) hide show
  1. ggml-impl.h +2 -0
  2. ggml-quants.h +68 -59
ggml-impl.h CHANGED
@@ -19,6 +19,7 @@ extern "C" {
19
  // fall back to the _Static_assert C11 keyword.
20
  // if C99 - static_assert is noop
21
  // ref: https://stackoverflow.com/a/53923785/4039976
 
22
  #ifndef static_assert
23
  #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
24
  #define static_assert(cond, msg) _Static_assert(cond, msg)
@@ -26,6 +27,7 @@ extern "C" {
26
  #define static_assert(cond, msg) struct global_scope_noop_trick
27
  #endif
28
  #endif
 
29
 
30
  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
31
  #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
 
19
  // fall back to the _Static_assert C11 keyword.
20
  // if C99 - static_assert is noop
21
  // ref: https://stackoverflow.com/a/53923785/4039976
22
+ #ifndef __cplusplus
23
  #ifndef static_assert
24
  #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
25
  #define static_assert(cond, msg) _Static_assert(cond, msg)
 
27
  #define static_assert(cond, msg) struct global_scope_noop_trick
28
  #endif
29
  #endif
30
+ #endif
31
 
32
  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
33
  #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
ggml-quants.h CHANGED
@@ -191,70 +191,74 @@ typedef struct {
191
  } block_iq3_xxs;
192
  static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
193
 
 
 
 
 
194
  // Quantization
195
- void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
196
- void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
197
- void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
198
- void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
199
- void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
200
- void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
201
-
202
- void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
203
- void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
204
- void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
205
- void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
206
- void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
207
- void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
208
- void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k);
209
-
210
- void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
211
- void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
212
- void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
213
- void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
214
- void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
215
- void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
216
-
217
- void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
218
- void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
219
- void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
220
- void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
221
- void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
222
- void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
223
- void quantize_row_iq3_xxs(const float * restrict x, void * restrict y, int k);
224
 
225
  // Dequantization
226
- void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
227
- void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
228
- void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
229
- void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
230
- void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
231
- //void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
232
-
233
- void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
234
- void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
235
- void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
236
- void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
237
- void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
238
- void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
239
- void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
240
- void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k);
241
- void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k);
242
 
243
  // Dot product
244
- void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
245
- void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
246
- void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
247
- void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
248
- void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
249
-
250
- void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
251
- void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
252
- void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
253
- void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
254
- void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
255
- void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
256
- void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
257
- void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
258
 
259
  //
260
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
@@ -276,3 +280,8 @@ void iq2xs_init_impl(int grid_size);
276
  void iq2xs_free_impl(int grid_size);
277
  void iq3xs_init_impl(int grid_size);
278
  void iq3xs_free_impl(int grid_size);
 
 
 
 
 
 
191
  } block_iq3_xxs;
192
  static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
193
 
194
+ #ifdef __cplusplus
195
+ extern "C" {
196
+ #endif
197
+
198
  // Quantization
199
+ void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
200
+ void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
201
+ void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
202
+ void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
203
+ void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
204
+ void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
205
+
206
+ void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
207
+ void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
208
+ void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
209
+ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
210
+ void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
211
+ void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
212
+ void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
213
+
214
+ void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
215
+ void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
216
+ void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
217
+ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
218
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
219
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
220
+
221
+ void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
222
+ void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
223
+ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
224
+ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
225
+ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
226
+ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
227
+ void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
228
 
229
  // Dequantization
230
+ void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
231
+ void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
232
+ void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
233
+ void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
234
+ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
235
+ //void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
236
+
237
+ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
238
+ void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
239
+ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
240
+ void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
241
+ void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
242
+ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
243
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
244
+ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
245
+ void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
246
 
247
  // Dot product
248
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
249
+ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
250
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
251
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
252
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
253
+
254
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
255
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
256
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
257
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
258
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
259
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
260
+ void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
261
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
262
 
263
  //
264
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
 
280
  void iq2xs_free_impl(int grid_size);
281
  void iq3xs_init_impl(int grid_size);
282
  void iq3xs_free_impl(int grid_size);
283
+
284
+ #ifdef __cplusplus
285
+ }
286
+ #endif
287
+