ggerganov committed
Commit 6ee8740 · unverified · 1 Parent(s): 3b6a58b

ggml : sync latest ggml repo

- new Q4 and Q8 quantization: block scales are now stored as FP16 instead of FP32 (see the sketch below)
- updated CUDA: a single templated dequantize kernel, a broadcasting mul kernel, and direct-to-VRAM tensor loading
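
The sketch below illustrates the Q4_0 format touched by this sync: 32 weights share one scale d (now stored as FP16 in the real block_q4_0; kept as a plain float here for brevity, where the real code converts with GGML_FP32_TO_FP16 / GGML_FP16_TO_FP32) and are packed as 4-bit values, two per byte. The toy_* names are illustrative, not ggml API; the quantize/dequantize logic mirrors quantize_row_q4_0_reference and dequantize_row_q4_0 from the ggml.c hunks further down.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define QK4_0 32

typedef struct {
    float   d;              /* scale; stored as ggml_fp16_t in the real block_q4_0 after this commit */
    uint8_t qs[QK4_0 / 2];  /* 32 quants in [-8, 7], packed two per byte */
} toy_block_q4_0;

/* quantize one block of 32 floats, mirroring quantize_row_q4_0_reference */
static void toy_quantize_q4_0(const float * x, toy_block_q4_0 * y) {
    float amax = 0.0f; /* absolute max */
    float max  = 0.0f; /* signed value at the absolute max */
    for (int j = 0; j < QK4_0; ++j) {
        if (fabsf(x[j]) > amax) { amax = fabsf(x[j]); max = x[j]; }
    }
    const float d  = max / -8;
    const float id = d ? 1.0f/d : 0.0f;
    y->d = d;
    for (int j = 0; j < QK4_0/2; ++j) {
        const uint8_t q0 = (uint8_t) fminf(15.0f, x[j          ]*id + 8.5f);
        const uint8_t q1 = (uint8_t) fminf(15.0f, x[j + QK4_0/2]*id + 8.5f);
        y->qs[j] = q0 | (q1 << 4);
    }
}

/* dequantize, mirroring dequantize_row_q4_0 */
static void toy_dequantize_q4_0(const toy_block_q4_0 * x, float * y) {
    for (int j = 0; j < QK4_0/2; ++j) {
        y[j          ] = ((x->qs[j] & 0x0F) - 8)*x->d;
        y[j + QK4_0/2] = ((x->qs[j] >>   4) - 8)*x->d;
    }
}

int main(void) {
    float in[QK4_0], out[QK4_0];
    for (int j = 0; j < QK4_0; ++j) in[j] = sinf(0.1f*j);
    toy_block_q4_0 b;
    toy_quantize_q4_0(in, &b);
    toy_dequantize_q4_0(&b, out);
    printf("in[3] = %.3f, out[3] = %.3f\n", in[3], out[3]);
    return 0;
}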

Files changed (5)
  1. examples/common.cpp +2 -1
  2. ggml-cuda.cu +159 -132
  3. ggml-cuda.h +2 -0
  4. ggml.c +322 -155
  5. ggml.h +13 -3
examples/common.cpp CHANGED
@@ -26,7 +26,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "-n" || arg == "--n_predict") {
             params.n_predict = std::stoi(argv[++i]);
         } else if (arg == "--top_k") {
-            params.top_k = std::stoi(argv[++i]);
+            params.top_k = std::max(1, std::stoi(argv[++i]));
         } else if (arg == "--top_p") {
             params.top_p = std::stof(argv[++i]);
         } else if (arg == "--temp") {
@@ -259,6 +259,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
             if (it != vocab.token_to_id.end()) {
                 tokens.push_back(it->second);
                 i = j;
+                j = n;
                 break;
             }
             --j;
ggml-cuda.cu CHANGED
@@ -42,19 +42,19 @@ typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y,
42
  #define QK4_0 32
43
  #define QR4_0 2
44
  typedef struct {
45
- float d; // delta
46
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
47
  } block_q4_0;
48
- static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
49
 
50
  #define QK4_1 32
51
  #define QR4_1 2
52
  typedef struct {
53
- float d; // delta
54
- float m; // min
55
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
56
  } block_q4_1;
57
- static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
58
 
59
  #define QK5_0 32
60
  #define QR5_0 2
@@ -78,12 +78,23 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
78
  #define QK8_0 32
79
  #define QR8_0 1
80
  typedef struct {
81
- float d; // delta
82
  int8_t qs[QK8_0]; // quants
83
  } block_q8_0;
84
- static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
85
 
86
- #define CUDA_DMMV_BLOCK_SIZE 32
 
87
 
88
  static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
89
  const block_q4_0 * x = (const block_q4_0 *) vx;
@@ -170,104 +181,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
170
  v1 = __half2float(x[ib + 1]);
171
  }
172
 
173
- static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
174
- static const int qk = QK4_0;
175
-
176
- const block_q4_0 * x = (const block_q4_0 *) vx;
177
-
178
- const int i = blockIdx.x;
179
-
180
- const float d = x[i].d;
181
-
182
- for (int j = 0; j < qk/2; ++j) {
183
- const int x0 = (x[i].qs[j] & 0xf) - 8;
184
- const int x1 = (x[i].qs[j] >> 4) - 8;
185
-
186
- y[i*qk + j + 0 ] = x0*d;
187
- y[i*qk + j + qk/2] = x1*d;
188
- }
189
- }
190
-
191
- static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
192
- static const int qk = QK4_1;
193
 
194
- const block_q4_1 * x = (const block_q4_1 *) vx;
195
-
196
- const int i = blockIdx.x;
197
-
198
- const float d = x[i].d;
199
- const float m = x[i].m;
200
-
201
- for (int j = 0; j < qk/2; ++j) {
202
- const int x0 = (x[i].qs[j] & 0xf);
203
- const int x1 = (x[i].qs[j] >> 4);
204
-
205
- y[i*qk + j + 0 ] = x0*d + m;
206
- y[i*qk + j + qk/2] = x1*d + m;
207
  }
208
- }
209
-
210
- static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
211
- static const int qk = QK5_0;
212
-
213
- const block_q5_0 * x = (const block_q5_0 *) vx;
214
-
215
- const int i = blockIdx.x;
216
-
217
- const float d = x[i].d;
218
-
219
- uint32_t qh;
220
- memcpy(&qh, x[i].qh, sizeof(qh));
221
 
222
- for (int j = 0; j < qk/2; ++j) {
223
- const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
224
- const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
225
-
226
- const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
227
- const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
228
-
229
- y[i*qk + j + 0 ] = x0*d;
230
- y[i*qk + j + qk/2] = x1*d;
231
- }
232
- }
233
-
234
- static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
235
- static const int qk = QK5_1;
236
-
237
- const block_q5_1 * x = (const block_q5_1 *) vx;
238
-
239
- const int i = blockIdx.x;
240
-
241
- const float d = x[i].d;
242
- const float m = x[i].m;
243
-
244
- uint32_t qh;
245
- memcpy(&qh, x[i].qh, sizeof(qh));
246
-
247
- for (int j = 0; j < qk/2; ++j) {
248
- const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
249
- const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
250
-
251
- const int x0 = (x[i].qs[j] & 0xf) | xh_0;
252
- const int x1 = (x[i].qs[j] >> 4) | xh_1;
253
-
254
- y[i*qk + j + 0 ] = x0*d + m;
255
- y[i*qk + j + qk/2] = x1*d + m;
256
- }
257
- }
258
-
259
- static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
260
- static const int qk = QK8_0;
261
-
262
- const block_q8_0 * x = (const block_q8_0 *) vx;
263
-
264
- const int i = blockIdx.x;
265
-
266
- const float d = x[i].d;
267
 
268
- for (int j = 0; j < qk; ++j) {
269
- y[i*qk + j] = x[i].qs[j]*d;
270
- }
 
271
  }
272
 
273
  template <int block_size, int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -308,29 +238,34 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
308
  }
309
  }
310
 
311
- static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
312
- const int nb = k / QK4_0;
313
- dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
 
314
  }
315
 
316
- static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
317
- const int nb = k / QK4_1;
318
- dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
319
  }
320
 
321
- static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
322
- const int nb = k / QK5_0;
323
- dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
324
  }
325
 
326
- static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
327
- const int nb = k / QK5_1;
328
- dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y);
329
  }
330
 
331
- static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
332
- const int nb = k / QK8_0;
333
- dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
334
  }
335
 
336
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@@ -363,17 +298,9 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
363
  <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
364
  }
365
 
366
- // TODO: optimize
367
- static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
368
- const half * x = (const half *) vx;
369
-
370
- const int i = blockIdx.x;
371
-
372
- y[i] = __half2float(x[i]);
373
- }
374
-
375
- static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStream_t stream) {
376
- convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
377
  }
378
 
379
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@@ -555,6 +482,67 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor
555
  }
556
  }
557
 
558
  static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
559
  const int64_t ne00 = src0->ne[0];
560
  const int64_t ne01 = src0->ne[1];
@@ -812,6 +800,11 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
812
  ggml_cuda_pool_free(d_Q, q_size);
813
  }
814
 
815
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
816
  const int64_t ne10 = src1->ne[0];
817
 
@@ -885,14 +878,48 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
885
  const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
886
 
887
  size_t q_size;
888
- char * d_Q = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
889
 
890
  cudaStream_t cudaStream2 = g_cudaStreams2[0];
891
 
892
  // copy tensor to device
893
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, tensor, 0, 0, cudaStream2));
894
- CUDA_CHECK(cudaDeviceSynchronize());
 
895
 
896
- tensor->data = d_Q;
897
  tensor->backend = GGML_BACKEND_CUDA;
898
  }

 
42
  #define QK4_0 32
43
  #define QR4_0 2
44
  typedef struct {
45
+ half d; // delta
46
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
47
  } block_q4_0;
48
+ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
49
 
50
  #define QK4_1 32
51
  #define QR4_1 2
52
  typedef struct {
53
+ half d; // delta
54
+ half m; // min
55
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
56
  } block_q4_1;
57
+ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
58
 
59
  #define QK5_0 32
60
  #define QR5_0 2
 
78
  #define QK8_0 32
79
  #define QR8_0 1
80
  typedef struct {
81
+ half d; // delta
82
  int8_t qs[QK8_0]; // quants
83
  } block_q8_0;
84
+ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
85
 
86
+ #define CUDA_MUL_BLOCK_SIZE 256
87
+ #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
88
+ #define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec
89
+
90
+ static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
91
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
92
+
93
+ if (i >= kx) {
94
+ return;
95
+ }
96
+ dst[i] = x[i] * y[i%ky];
97
+ }
98
 
99
  static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
100
  const block_q4_0 * x = (const block_q4_0 *) vx;
 
181
  v1 = __half2float(x[ib + 1]);
182
  }
183
 
184
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
185
+ static __global__ void dequantize_block(const void * vx, float * y, const int k) {
186
+ const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
 
187
 
188
+ if (i >= k) {
189
+ return;
 
190
  }
 
191
 
192
+ const int ib = i/qk; // block index
193
+ const int iqs = (i%qk)/qr; // quant index
194
+ const int iybs = i - i%qk; // y block start index
195
+ const int y_offset = qr == 1 ? 1 : qk/2;
 
196
 
197
+ // dequantize
198
+ float & v0 = y[iybs + iqs + 0];
199
+ float & v1 = y[iybs + iqs + y_offset];
200
+ dequantize_kernel(vx, ib, iqs, v0, v1);
201
  }
202
 
203
  template <int block_size, int qk, int qr, dequantize_kernel_t dequantize_kernel>
 
238
  }
239
  }
240
 
241
+ static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
242
+ const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
243
+ mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
244
+ }
245
+
246
+ static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
247
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
248
+ dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
249
  }
250
 
251
+ static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
252
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
253
+ dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
254
  }
255
 
256
+ static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
257
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
258
+ dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
259
  }
260
 
261
+ static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
262
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
263
+ dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
264
  }
265
 
266
+ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
267
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
268
+ dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
269
  }
270
 
271
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 
298
  <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
299
  }
300
 
301
+ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
302
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
303
+ dequantize_block<32, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 
304
  }
305
 
306
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 
482
  }
483
  }
484
 
485
+ static void ggml_cuda_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
486
+ GGML_ASSERT(src1->backend == GGML_BACKEND_CUDA);
487
+ const int64_t ne00 = src0->ne[0];
488
+ const int64_t ne01 = src0->ne[1];
489
+ const int64_t ne02 = src0->ne[2];
490
+ const int64_t ne03 = src0->ne[2];
491
+ const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
492
+ const int64_t ne10 = src1->ne[0];
493
+ const int64_t ne11 = src1->ne[1];
494
+ const int64_t ne12 = src1->ne[2];
495
+ const int64_t ne13 = src1->ne[3];
496
+ const int nb2 = dst->nb[2];
497
+ const int nb3 = dst->nb[3];
498
+ size_t x_size, d_size;
499
+
500
+ float * d_X = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &x_size); // src0
501
+ float * d_Y = (float *) src1->data; // src1 is already on device, broadcasted.
502
+ float * d_D = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &d_size); // dst
503
+
504
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
505
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
506
+ const int i0 = i03*ne02 + i02;
507
+ float * c_X2 = d_X + i0*ne01*ne00;
508
+ float * c_D2 = d_D + i0*ne01*ne00;
509
+
510
+ cudaStream_t cudaStream = g_cudaStreams[i0 % GGML_CUDA_MAX_STREAMS];
511
+ cudaStream_t cudaStream2 = g_cudaStreams2[i0 % GGML_CUDA_MAX_STREAMS];
512
+ cudaEvent_t cudaEvent = g_cudaEvents[i0 % GGML_CUDA_MAX_EVENTS];
513
+
514
+ // copy src0 to device
515
+ CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X2, src0, i03, i02, cudaStream2));
516
+ CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
517
+
518
+ // wait for data
519
+ CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
520
+
521
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
522
+ const int64_t i13 = i03%ne13;
523
+ const int64_t i12 = i02%ne12;
524
+ const int64_t i11 = i01%ne11;
525
+ const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
526
+
527
+ float * c_X1 = c_X2 + i01*ne00;
528
+ float * c_Y = d_Y + i1*ne10;
529
+ float * c_D1 = c_D2 + i01*ne00;
530
+
531
+ // compute
532
+ mul_f32_cuda(c_X1, c_Y, c_D1, ne00, ne10, cudaStream);
533
+ CUDA_CHECK(cudaGetLastError());
534
+ }
535
+
536
+ // copy dst to host
537
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
538
+ CUDA_CHECK(cudaMemcpyAsync(d, c_D2, sizeof(float)*ne00*ne01, cudaMemcpyDeviceToHost, cudaStream));
539
+ }
540
+ }
541
+ CUDA_CHECK(cudaDeviceSynchronize());
542
+ ggml_cuda_pool_free(d_X, x_size);
543
+ ggml_cuda_pool_free(d_D, d_size);
544
+ }
545
+
546
  static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
547
  const int64_t ne00 = src0->ne[0];
548
  const int64_t ne01 = src0->ne[1];
 
800
  ggml_cuda_pool_free(d_Q, q_size);
801
  }
802
 
803
+ void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
804
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
805
+ ggml_cuda_mul_f32(src0, src1, dst);
806
+ }
807
+
808
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
809
  const int64_t ne10 = src1->ne[0];
810
 
 
878
  const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
879
 
880
  size_t q_size;
881
+ char * dst = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
882
 
883
  cudaStream_t cudaStream2 = g_cudaStreams2[0];
884
 
885
  // copy tensor to device
886
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
887
+ for (int64_t i2 = 0; i2 < ne2; i2++) {
888
+ int i = i3*ne2 + i2;
889
+ CUDA_CHECK(ggml_cuda_h2d_tensor_2d(dst + i*ne0*ne1, tensor, i3, i2, cudaStream2));
890
+ }
891
+ }
892
 
893
+ tensor->data = dst;
894
  tensor->backend = GGML_BACKEND_CUDA;
895
  }
896
+
897
+ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
898
+ FILE * fp = fopen(fname, "rb");
899
+
900
+ const size_t size = ggml_nbytes(tensor);
901
+
902
+ void * buf;
903
+ CUDA_CHECK(cudaMalloc(&buf, size));
904
+ void * buf_host = malloc(size);
905
+
906
+ #ifdef _WIN32
907
+ int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
908
+ #else
909
+ int ret = fseek(fp, (long) offset, SEEK_SET);
910
+ #endif
911
+ GGML_ASSERT(ret == 0); // same
912
+
913
+ size_t ret2 = fread(buf_host, size, 1, fp);
914
+ if (ret2 != 1) {
915
+ fprintf(stderr, "unexpectedly reached end of file");
916
+ exit(1);
917
+ }
918
+
919
+ cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
920
+ cudaDeviceSynchronize();
921
+
922
+ tensor->data = buf;
923
+ free(buf_host);
924
+ fclose(fp);
925
+ }
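
The ggml-cuda.cu changes above collapse the five per-type dequantize_block_q*_0/1 kernels into a single dequantize_block<qk, qr, dequantize_kernel> template (also reused for FP16 conversion), add a broadcasting mul_f32 kernel behind ggml_cuda_mul, and add ggml_cuda_load_data, which reads a tensor's bytes from the model file at a given offset straight into a cudaMalloc'd buffer. The self-contained toy below illustrates only the templated-dequantize pattern, with a made-up scale+int8 block format and simplified one-element-per-thread indexing; the toy_* names are placeholders, and this is not the ggml kernel itself.

#include <cstdio>
#include <cuda_runtime.h>

// toy quantized block: one float scale + 32 int8 quants (loosely modelled on block_q8_0)
#define TOY_QK 32
struct toy_block { float d; int8_t qs[TOY_QK]; };

// per-format "dequantize one value" hook, analogous to dequantize_q4_0/dequantize_q8_0
typedef void (*toy_dequant_t)(const void * vx, int ib, int iqs, float & v);

static __device__ void toy_dequant_q8(const void * vx, int ib, int iqs, float & v) {
    const toy_block * x = (const toy_block *) vx;
    v = x[ib].qs[iqs] * x[ib].d;
}

// generic kernel: one thread per output element; block/quant index derived from the flat index
template <int qk, toy_dequant_t dequant>
static __global__ void toy_dequantize_block(const void * vx, float * y, int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dequant(vx, i/qk, i%qk, y[i]);
}

int main() {
    const int k = 4*TOY_QK;

    toy_block h_blocks[4];
    for (int b = 0; b < 4; ++b) {
        h_blocks[b].d = 0.5f;
        for (int j = 0; j < TOY_QK; ++j) h_blocks[b].qs[j] = (int8_t)(j - 16);
    }

    toy_block * d_blocks; float * d_y;
    cudaMalloc((void **) &d_blocks, sizeof(h_blocks));
    cudaMalloc((void **) &d_y, k*sizeof(float));
    cudaMemcpy(d_blocks, h_blocks, sizeof(h_blocks), cudaMemcpyHostToDevice);

    const int block_size = 256;
    toy_dequantize_block<TOY_QK, toy_dequant_q8>
        <<<(k + block_size - 1)/block_size, block_size>>>(d_blocks, d_y, k);

    float h_y[4*TOY_QK];
    cudaMemcpy(h_y, d_y, k*sizeof(float), cudaMemcpyDeviceToHost);
    printf("y[0] = %.1f, y[33] = %.1f\n", h_y[0], h_y[33]); // expect -8.0 and -7.5
    cudaFree(d_blocks); cudaFree(d_y);
    return 0;
}
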
ggml-cuda.h CHANGED
@@ -6,6 +6,7 @@ extern "C" {

 void ggml_init_cublas(void);

+void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
@@ -15,6 +16,7 @@ void * ggml_cuda_host_malloc(size_t size);
 void ggml_cuda_host_free(void * ptr);

 void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);

 #ifdef __cplusplus
 }
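
The two declarations added to ggml-cuda.h are the host-facing entry points for the CUDA changes: ggml_cuda_mul runs the broadcasting F32 multiply on operands whose src1 already lives in VRAM, and ggml_cuda_load_data copies a tensor's bytes from a model file at a given offset directly into a device buffer, after which tensor->data points at VRAM. A hypothetical loader-side use might look like the hedged sketch below; the file name, tensor shape, type, and offset are placeholders invented for illustration, not values from this commit.

#include <stdio.h>
#include "ggml.h"
#include "ggml-cuda.h"

/* hypothetical loader sketch (not from this commit): create a quantized tensor
   in a ggml context, then pull its weights from the model file at file_offset
   straight into VRAM via the new ggml_cuda_load_data. */
int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 32*1024*1024,   /* enough for the placeholder tensor's metadata and data */
        .mem_buffer = NULL,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * tensor = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 4096);

    const char * fname       = "model.bin";  /* placeholder path */
    const size_t file_offset = 0;            /* placeholder offset of this tensor's data in the file */

    /* replaces tensor->data with a freshly cudaMalloc'd buffer filled from the file */
    ggml_cuda_load_data(fname, tensor, file_offset);
    printf("tensor data now lives on the GPU: %p\n", tensor->data);

    ggml_free(ctx);
    return 0;
}
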
ggml.c CHANGED
@@ -512,7 +512,7 @@ static inline int hsum_i32_4(const __m128i a) {
512
  return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
513
  }
514
 
515
- #if __AVX2__ || __AVX512F__
516
  // spread 32 bits to 32 bytes { 0x00, 0xFF }
517
  static inline __m256i bytes_from_bits_32(const uint8_t * x) {
518
  uint32_t x32;
@@ -543,12 +543,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
543
  return _mm256_cvtepi32_ps(summed_pairs);
544
  }
545
 
546
- // multiply int8_t, add results pairwise twice and return as float vector
547
- static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
548
- // Get absolute values of x vectors
549
- const __m256i ax = _mm256_sign_epi8(x, x);
550
- // Sign the values of the y vectors
551
- const __m256i sy = _mm256_sign_epi8(y, x);
552
  #if __AVXVNNI__
553
  const __m256i zero = _mm256_setzero_si256();
554
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
@@ -560,6 +555,21 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
560
  #endif
561
  }
562
 
563
  static inline __m128i packNibbles( __m256i bytes )
564
  {
565
  // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
@@ -619,6 +629,17 @@ static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
619
  return _mm256_cvtepi32_ps(summed_pairs);
620
  }
621
 
622
  // multiply int8_t, add results pairwise twice and return as float vector
623
  static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
624
  const __m128i xl = _mm256_castsi256_si128(x);
@@ -667,7 +688,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
667
  #endif // __AVX__ || __AVX2__ || __AVX512F__
668
  #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
669
 
670
- #if __ARM_NEON
671
 
672
  #if !defined(__aarch64__)
673
 
@@ -748,18 +769,18 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
748
 
749
  #define QK4_0 32
750
  typedef struct {
751
- float d; // delta
752
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
753
  } block_q4_0;
754
- static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
755
 
756
  #define QK4_1 32
757
  typedef struct {
758
- float d; // delta
759
- float m; // min
760
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
761
  } block_q4_1;
762
- static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding");
763
 
764
  #define QK5_0 32
765
  typedef struct {
@@ -780,16 +801,16 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
780
 
781
  #define QK8_0 32
782
  typedef struct {
783
- float d; // delta
784
- int8_t qs[QK8_0]; // quants
785
  } block_q8_0;
786
- static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
787
 
788
  #define QK8_1 32
789
  typedef struct {
790
- float d; // delta
791
- float s; // d * sum(qs[i])
792
- int8_t qs[QK8_1]; // quants
793
  } block_q8_1;
794
  static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
795
 
@@ -816,7 +837,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
816
  const float d = max / -8;
817
  const float id = d ? 1.0f/d : 0.0f;
818
 
819
- y[i].d = d;
820
 
821
  for (int j = 0; j < qk/2; ++j) {
822
  const float x0 = x[i*qk + 0 + j]*id;
@@ -856,8 +877,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
856
  const float d = (max - min) / ((1 << 4) - 1);
857
  const float id = d ? 1.0f/d : 0.0f;
858
 
859
- y[i].d = d;
860
- y[i].m = min;
861
 
862
  for (int j = 0; j < qk/2; ++j) {
863
  const float x0 = (x[i*qk + 0 + j] - min)*id;
@@ -988,7 +1009,7 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
988
  const float d = amax / ((1 << 7) - 1);
989
  const float id = d ? 1.0f/d : 0.0f;
990
 
991
- y[i].d = d;
992
 
993
  for (int j = 0; j < QK8_0; ++j) {
994
  const float x0 = x[i*QK8_0 + j]*id;
@@ -1023,7 +1044,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
1023
  const float d = amax / ((1 << 7) - 1);
1024
  const float id = d ? 1.0f/d : 0.0f;
1025
 
1026
- y[i].d = d;
1027
 
1028
  for (int j = 0; j < 8; j++) {
1029
  const float32x4_t v = vmulq_n_f32(srcv[j], id);
@@ -1058,7 +1079,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
1058
 
1059
  // Quantize these floats
1060
  const float d = maxScalar / 127.f;
1061
- y[i].d = d;
1062
  const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
1063
  const __m256 mul = _mm256_set1_ps( id );
1064
 
@@ -1157,7 +1178,7 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r
1157
  sum += y[i].qs[QK8_1/2 + j];
1158
  }
1159
 
1160
- y[i].s = d * sum;
1161
  }
1162
  }
1163
 
@@ -1309,7 +1330,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
1309
  const int nb = k / qk;
1310
 
1311
  for (int i = 0; i < nb; i++) {
1312
- const float d = x[i].d;
1313
 
1314
  for (int j = 0; j < qk/2; ++j) {
1315
  const int x0 = (x[i].qs[j] & 0x0F) - 8;
@@ -1329,8 +1350,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
1329
  const int nb = k / qk;
1330
 
1331
  for (int i = 0; i < nb; i++) {
1332
- const float d = x[i].d;
1333
- const float m = x[i].m;
1334
 
1335
  for (int j = 0; j < qk/2; ++j) {
1336
  const int x0 = (x[i].qs[j] & 0x0F);
@@ -1405,7 +1426,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
1405
  const block_q8_0 * restrict x = vx;
1406
 
1407
  for (int i = 0; i < nb; i++) {
1408
- const float d = x[i].d;
1409
 
1410
  for (int j = 0; j < qk; ++j) {
1411
  y[i*qk + j] = x[i].qs[j]*d;
@@ -1669,8 +1690,9 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1669
  static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
1670
  float tmp[8];
1671
 
1672
- for (int i = 0; i < 8; i++)
1673
  tmp[i] = GGML_FP16_TO_FP32(x[i]);
 
1674
 
1675
  return _mm256_loadu_ps(tmp);
1676
  }
@@ -2090,8 +2112,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2090
  const block_q8_0 * restrict y0 = &y[i + 0];
2091
  const block_q8_0 * restrict y1 = &y[i + 1];
2092
 
2093
- const uint8x16_t m4b = vdupq_n_u8(0x0F);
2094
- const int8x16_t s8b = vdupq_n_s8(0x8);
2095
 
2096
  const uint8x16_t v0_0 = vld1q_u8(x0->qs);
2097
  const uint8x16_t v0_1 = vld1q_u8(x1->qs);
@@ -2119,8 +2141,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2119
  const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
2120
  const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
2121
 
2122
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
2123
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
2124
  #else
2125
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
2126
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
@@ -2137,8 +2159,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2137
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2138
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2139
 
2140
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
2141
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
2142
  #endif
2143
  }
2144
 
@@ -2150,7 +2172,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2150
  // Main loop
2151
  for (int i = 0; i < nb; ++i) {
2152
  /* Compute combined scale for the block */
2153
- const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
2154
 
2155
  __m256i bx = bytes_from_nibbles_32(x[i].qs);
2156
 
@@ -2174,7 +2196,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2174
  // Main loop
2175
  for (int i = 0; i < nb; ++i) {
2176
  // Compute combined scale for the block
2177
- const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
2178
 
2179
  const __m128i lowMask = _mm_set1_epi8(0xF);
2180
  const __m128i off = _mm_set1_epi8(8);
@@ -2216,7 +2238,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2216
  _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);
2217
 
2218
  // Compute combined scale for the block 0 and 1
2219
- const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[0].d ), _mm_set1_ps( y[0].d ) );
2220
 
2221
  const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
2222
 
@@ -2234,7 +2256,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2234
  _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
2235
 
2236
  // Compute combined scale for the block 2 and 3
2237
- const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[1].d ), _mm_set1_ps( y[1].d ) );
2238
 
2239
  const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
2240
 
@@ -2267,7 +2289,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2267
  _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
2268
 
2269
  // Compute combined scale for the block 0 and 1
2270
- const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[i].d ), _mm_set1_ps( y[i].d ) );
2271
 
2272
  const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
2273
 
@@ -2285,7 +2307,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2285
  _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
2286
 
2287
  // Compute combined scale for the block 2 and 3
2288
- const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[i + 1].d ), _mm_set1_ps( y[i + 1].d ) );
2289
 
2290
  const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
2291
 
@@ -2333,7 +2355,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2333
  sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
2334
  }
2335
 
2336
- sumf += (x[i].d*y[i].d)*sumi;
2337
  }
2338
 
2339
  *s = sumf;
@@ -2363,7 +2385,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2363
  const block_q8_1 * restrict y0 = &y[i + 0];
2364
  const block_q8_1 * restrict y1 = &y[i + 1];
2365
 
2366
- summs += x0->m * y0->s + x1->m * y1->s;
2367
 
2368
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
2369
 
@@ -2387,8 +2409,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2387
  const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
2388
  const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
2389
 
2390
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
2391
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
2392
  #else
2393
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
2394
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
@@ -2405,8 +2427,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2405
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2406
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2407
 
2408
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
2409
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
2410
  #endif
2411
  }
2412
 
@@ -2419,13 +2441,13 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2419
 
2420
  // Main loop
2421
  for (int i = 0; i < nb; ++i) {
2422
- const float * d0 = &x[i].d;
2423
- const float * d1 = &y[i].d;
2424
 
2425
- summs += x[i].m * y[i].s;
2426
 
2427
- const __m256 d0v = _mm256_broadcast_ss( d0 );
2428
- const __m256 d1v = _mm256_broadcast_ss( d1 );
2429
 
2430
  // Compute combined scales
2431
  const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
@@ -2434,7 +2456,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2434
  const __m256i bx = bytes_from_nibbles_32(x[i].qs);
2435
  const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
2436
 
2437
- const __m256 xy = mul_sum_i8_pairs_float(bx, by);
2438
 
2439
  // Accumulate d0*d1*x*y
2440
  #if defined(__AVX2__)
@@ -2459,7 +2481,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2459
  sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
2460
  }
2461
 
2462
- sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s;
2463
  }
2464
 
2465
  *s = sumf;
@@ -2535,16 +2557,13 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2535
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2536
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2537
 
2538
- const float x0d = GGML_FP16_TO_FP32(x0->d);
2539
- const float x1d = GGML_FP16_TO_FP32(x1->d);
2540
-
2541
  #if defined(__ARM_FEATURE_DOTPROD)
2542
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
2543
  vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
2544
- vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d);
2545
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
2546
  vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
2547
- vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d);
2548
  #else
2549
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
2550
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -2561,8 +2580,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2561
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2562
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2563
 
2564
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
2565
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d);
2566
  #endif
2567
  }
2568
 
@@ -2637,7 +2656,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2637
  // Main loop
2638
  for (int i = 0; i < nb; i++) {
2639
  /* Compute combined scale for the block */
2640
- const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
2641
 
2642
  __m256i bx = bytes_from_nibbles_32(x[i].qs);
2643
  __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -2661,7 +2680,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2661
  // Main loop
2662
  for (int i = 0; i < nb; i++) {
2663
  /* Compute combined scale for the block */
2664
- const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
2665
 
2666
  __m256i bx = bytes_from_nibbles_32(x[i].qs);
2667
  const __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -2704,7 +2723,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2704
  sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
2705
  }
2706
 
2707
- sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi;
2708
  }
2709
 
2710
  *s = sumf;
@@ -2786,16 +2805,13 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2786
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2787
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2788
 
2789
- const float x0d = GGML_FP16_TO_FP32(x0->d);
2790
- const float x1d = GGML_FP16_TO_FP32(x1->d);
2791
-
2792
  #if defined(__ARM_FEATURE_DOTPROD)
2793
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
2794
  vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
2795
- vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d);
2796
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
2797
  vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
2798
- vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d);
2799
  #else
2800
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
2801
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -2812,8 +2828,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2812
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2813
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2814
 
2815
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
2816
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d);
2817
  #endif
2818
  }
2819
 
@@ -2873,15 +2889,14 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2873
  const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
2874
  const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
2875
 
2876
- const float x0d = GGML_FP16_TO_FP32(x0->d);
2877
-
2878
  // dot product
2879
- sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
2880
- wasm_i32x4_add(
2881
- wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
2882
- wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
2883
- wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
2884
- wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
 
2885
  }
2886
 
2887
  *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -2903,10 +2918,10 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2903
  bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
2904
  bx = _mm256_or_si256(bx, bxhi);
2905
 
2906
- const __m256 dy = _mm256_broadcast_ss(&y[i].d);
2907
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
2908
 
2909
- const __m256 q = mul_sum_i8_pairs_float(bx, by);
2910
 
2911
  acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
2912
  }
@@ -2937,10 +2952,10 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2937
  bxh = _mm_or_si128(bxh, bxhih);
2938
  bx = _mm256_set_m128i(bxh, bxl);
2939
 
2940
- const __m256 dy = _mm256_broadcast_ss(&y[i].d);
2941
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
2942
 
2943
- const __m256 q = mul_sum_i8_pairs_float(bx, by);
2944
 
2945
  acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
2946
  }
@@ -3007,11 +3022,11 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3007
  #if defined(__ARM_FEATURE_DOTPROD)
3008
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
3009
  vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
3010
- vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d);
3011
 
3012
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
3013
  vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
3014
- vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d);
3015
 
3016
  #else
3017
  const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
@@ -3029,8 +3044,8 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3029
  const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
3030
  const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
3031
 
3032
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d);
3033
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d);
3034
  #endif
3035
  }
3036
 
@@ -3042,7 +3057,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3042
  // Main loop
3043
  for (int i = 0; i < nb; ++i) {
3044
  // Compute combined scale for the block
3045
- const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
3046
  __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
3047
  __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
3048
 
@@ -3068,7 +3083,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3068
  sumi += x[i].qs[j]*y[i].qs[j];
3069
  }
3070
 
3071
- sumf += (x[i].d*y[i].d)*sumi;
3072
  }
3073
 
3074
  *s = sumf;
@@ -3457,6 +3472,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
3457
  "ROPE",
3458
  "ROPE_BACK",
3459
  "ALIBI",
 
3460
  "CONV_1D_1S",
3461
  "CONV_1D_2S",
3462
 
@@ -3467,7 +3483,8 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
3467
  "MAP_BINARY",
3468
  };
3469
 
3470
- static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
 
3471
 
3472
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3473
  "none",
@@ -3517,6 +3534,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3517
  "rope(x)",
3518
  "rope_back(x)",
3519
  "alibi(x)",
 
3520
  "conv_1d_1s(x)",
3521
  "conv_1d_2s(x)",
3522
 
@@ -3527,7 +3545,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3527
  "f(x,y)",
3528
  };
3529
 
3530
- static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
3531
 
3532
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3533
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3761,6 +3779,12 @@ static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct g
3761
  (t1->ne[3]%t0->ne[3] == 0);
3762
  }
3763
 
 
 
 
 
 
 
3764
  static inline int ggml_up32(int n) {
3765
  return (n + 31) & ~31;
3766
  }
@@ -4643,11 +4667,15 @@ struct ggml_tensor * ggml_mul_impl(
4643
  struct ggml_tensor * a,
4644
  struct ggml_tensor * b,
4645
  bool inplace) {
4646
- GGML_ASSERT(ggml_are_same_shape(a, b));
 
 
4647
 
4648
  bool is_node = false;
4649
 
4650
  if (!inplace && (a->grad || b->grad)) {
 
 
4651
  is_node = true;
4652
  }
4653
 
@@ -6189,7 +6217,8 @@ struct ggml_tensor * ggml_alibi(
6189
  struct ggml_context * ctx,
6190
  struct ggml_tensor * a,
6191
  int n_past,
6192
- int n_head) {
 
6193
  GGML_ASSERT(n_past >= 0);
6194
  bool is_node = false;
6195
 
@@ -6208,6 +6237,8 @@ struct ggml_tensor * ggml_alibi(
6208
 
6209
  ((int32_t *) b->data)[0] = n_past;
6210
  ((int32_t *) b->data)[1] = n_head;
 
 
6211
 
6212
  ggml_scratch_load(ctx);
6213
 
@@ -6219,6 +6250,40 @@ struct ggml_tensor * ggml_alibi(
6219
  return result;
6220
  }
6221
 
 
 
 
 
6222
  // ggml_conv_1d_1s
6223
 
6224
  struct ggml_tensor * ggml_conv_1d_1s(
@@ -7945,7 +8010,7 @@ static void ggml_compute_forward_mul_f32(
7945
  const struct ggml_tensor * src0,
7946
  const struct ggml_tensor * src1,
7947
  struct ggml_tensor * dst) {
7948
- assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7949
 
7950
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7951
  return;
@@ -7953,10 +8018,25 @@ static void ggml_compute_forward_mul_f32(
7953
  const int ith = params->ith;
7954
  const int nth = params->nth;
7955
 
7956
- const int nr = ggml_nrows(src0);
7957
- const int64_t ne0 = src0->ne[0];
7958
- const int64_t ne1 = src0->ne[1];
7959
- const int64_t ne2 = src0->ne[2];
 
7960
 
7961
  const size_t nb00 = src0->nb[0];
7962
  const size_t nb01 = src0->nb[1];
@@ -7975,44 +8055,51 @@ static void ggml_compute_forward_mul_f32(
7975
 
7976
  GGML_ASSERT( nb0 == sizeof(float));
7977
  GGML_ASSERT(nb00 == sizeof(float));
 
7978
 
7979
  if (nb10 == sizeof(float)) {
7980
- for (int ir = ith; ir < nr; ir += nth) {
7981
- // src0, src1 and dst are same shape => same indices
7982
- const int i3 = ir/(ne2*ne1);
7983
- const int i2 = (ir - i3*ne2*ne1)/ne1;
7984
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
 
 
 
 
7985
 
 
 
 
7986
 
7987
  #ifdef GGML_USE_ACCELERATE
7988
  UNUSED(ggml_vec_mul_f32);
7989
 
7990
- vDSP_vmul(
7991
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
7992
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
7993
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
7994
- ne0);
7995
  #else
7996
- ggml_vec_mul_f32(ne0,
7997
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
7998
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
7999
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
8000
  #endif
8001
  // }
8002
  // }
8003
  }
8004
  } else {
8005
  // src1 is not contiguous
8006
- for (int ir = ith; ir < nr; ir += nth) {
8007
- // src0, src1 and dst are same shape => same indices
8008
- const int i3 = ir/(ne2*ne1);
8009
- const int i2 = (ir - i3*ne2*ne1)/ne1;
8010
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
 
8011
 
8012
- float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
8013
- float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
8014
- for (int i0 = 0; i0 < ne0; i0++) {
8015
- float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
 
 
 
 
 
8016
 
8017
  dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
8018
  }
@@ -10501,34 +10588,29 @@ static void ggml_compute_forward_diag_mask_f32(
10501
  assert(src1->type == GGML_TYPE_I32);
10502
  assert(ggml_nelements(src1) == 2);
10503
 
 
 
 
10504
  const int n_past = ((int32_t *) src1->data)[0];
10505
  const bool inplace = (bool)((int32_t *) src1->data)[1];
10506
 
10507
- if (params->type == GGML_TASK_INIT) {
10508
- // TODO: this hack is not good, need a better way to handle this
10509
- if (!inplace) {
10510
- // use the init task to copy src -> dst
10511
- struct ggml_compute_params params_cpy = *params;
10512
-
10513
- params_cpy.ith = 0;
10514
- params_cpy.nth = 1;
10515
- params_cpy.type = GGML_TASK_COMPUTE;
10516
-
10517
- ggml_compute_forward_dup_same_cont(&params_cpy, src0, dst);
10518
- }
10519
 
10520
- return;
 
 
 
 
 
 
 
 
10521
  }
10522
 
10523
- if (params->type == GGML_TASK_FINALIZE) {
10524
  return;
10525
  }
10526
 
10527
- const int ith = params->ith;
10528
- const int nth = params->nth;
10529
-
10530
- assert(n_past >= 0);
10531
-
10532
  // TODO: handle transposed/permuted matrices
10533
 
10534
  const int n = ggml_nrows(src0);
@@ -10682,14 +10764,15 @@ static void ggml_compute_forward_alibi_f32(
10682
  struct ggml_tensor * dst) {
10683
  assert(params->ith == 0);
10684
  assert(src1->type == GGML_TYPE_I32);
10685
- assert(ggml_nelements(src1) == 2);
10686
 
10687
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10688
  return;
10689
  }
10690
 
10691
- const int n_past = ((int32_t *) src1->data)[0];
10692
- const int n_head = ((int32_t *) src1->data)[1];
 
10693
 
10694
  assert(n_past >= 0);
10695
 
@@ -10712,8 +10795,8 @@ static void ggml_compute_forward_alibi_f32(
10712
  // add alibi to src0 (KQ_scaled)
10713
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
10714
 
10715
- const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
10716
- const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
10717
 
10718
  for (int i = 0; i < ne0; i++) {
10719
  for (int j = 0; j < ne1; j++) {
@@ -10731,13 +10814,13 @@ static void ggml_compute_forward_alibi_f32(
10731
  m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
10732
  }
10733
 
10734
- pdst[0] = i * m_k + src[0];
 
10735
  }
10736
  }
10737
  }
10738
  }
10739
 
10740
-
10741
  static void ggml_compute_forward_alibi_f16(
10742
  const struct ggml_compute_params * params,
10743
  const struct ggml_tensor * src0,
@@ -10745,14 +10828,15 @@ static void ggml_compute_forward_alibi_f16(
10745
  struct ggml_tensor * dst) {
10746
  assert(params->ith == 0);
10747
  assert(src1->type == GGML_TYPE_I32);
10748
- assert(ggml_nelements(src1) == 2);
10749
 
10750
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10751
  return;
10752
  }
10753
 
10754
- const int n_past = ((int32_t *) src1->data)[0];
10755
- const int n_head = ((int32_t *) src1->data)[1];
 
10756
 
10757
  assert(n_past >= 0);
10758
 
@@ -10775,8 +10859,8 @@ static void ggml_compute_forward_alibi_f16(
10775
  // add alibi to src0 (KQ_scaled)
10776
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
10777
 
10778
- const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
10779
- const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
10780
 
10781
  for (int i = 0; i < ne0; i++) {
10782
  for (int j = 0; j < ne1; j++) {
@@ -10795,7 +10879,7 @@ static void ggml_compute_forward_alibi_f16(
10795
  }
10796
 
10797
  // we return F32
10798
- pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
10799
  }
10800
  }
10801
  }
@@ -10831,6 +10915,77 @@ static void ggml_compute_forward_alibi(
10831
  }
10832
  }
10833
 
 
 
 
 
 
 
 
10834
  // ggml_compute_forward_rope
10835
 
10836
  static void ggml_compute_forward_rope_f32(
@@ -12812,6 +12967,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
12812
  {
12813
  ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
12814
  } break;
 
 
 
 
12815
  case GGML_OP_CONV_1D_1S:
12816
  {
12817
  ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
@@ -13119,6 +13278,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
13119
  {
13120
  GGML_ASSERT(false); // TODO: not implemented
13121
  } break;
 
 
 
 
13122
  case GGML_OP_SILU:
13123
  {
13124
  // necessary for llama
@@ -13998,6 +14161,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
13998
  {
13999
  node->n_tasks = 1; //TODO
14000
  } break;
 
 
 
 
14001
  case GGML_OP_CONV_1D_1S:
14002
  case GGML_OP_CONV_1D_2S:
14003
  {
 
512
  return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
513
  }
514
 
515
+ #if defined(__AVX2__) || defined(__AVX512F__)
516
  // spread 32 bits to 32 bytes { 0x00, 0xFF }
517
  static inline __m256i bytes_from_bits_32(const uint8_t * x) {
518
  uint32_t x32;
 
543
  return _mm256_cvtepi32_ps(summed_pairs);
544
  }
545
 
546
+ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
 
 
 
 
 
547
  #if __AVXVNNI__
548
  const __m256i zero = _mm256_setzero_si256();
549
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
 
555
  #endif
556
  }
557
 
558
+ // multiply int8_t, add results pairwise twice and return as float vector
559
+ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
560
+ #if __AVXVNNIINT8__
561
+ const __m256i zero = _mm256_setzero_si256();
562
+ const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
563
+ return _mm256_cvtepi32_ps(summed_pairs);
564
+ #else
565
+ // Get absolute values of x vectors
566
+ const __m256i ax = _mm256_sign_epi8(x, x);
567
+ // Sign the values of the y vectors
568
+ const __m256i sy = _mm256_sign_epi8(y, x);
569
+ return mul_sum_us8_pairs_float(ax, sy);
570
+ #endif
571
+ }
572
+
573
  static inline __m128i packNibbles( __m256i bytes )
574
  {
575
  // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
 
629
  return _mm256_cvtepi32_ps(summed_pairs);
630
  }
631
 
632
+ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
633
+ const __m128i axl = _mm256_castsi256_si128(ax);
634
+ const __m128i axh = _mm256_extractf128_si256(ax, 1);
635
+ const __m128i syl = _mm256_castsi256_si128(sy);
636
+ const __m128i syh = _mm256_extractf128_si256(sy, 1);
637
+ // Perform multiplication and create 16-bit values
638
+ const __m128i dotl = _mm_maddubs_epi16(axl, syl);
639
+ const __m128i doth = _mm_maddubs_epi16(axh, syh);
640
+ return sum_i16_pairs_float(doth, dotl);
641
+ }
642
+
643
  // multiply int8_t, add results pairwise twice and return as float vector
644
  static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
645
  const __m128i xl = _mm256_castsi256_si128(x);
 
688
  #endif // __AVX__ || __AVX2__ || __AVX512F__
689
  #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
690
 
691
+ #if defined(__ARM_NEON)
692
 
693
  #if !defined(__aarch64__)
694
 
 
769
 
770
  #define QK4_0 32
771
  typedef struct {
772
+ ggml_fp16_t d; // delta
773
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
774
  } block_q4_0;
775
+ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
776
 
777
  #define QK4_1 32
778
  typedef struct {
779
+ ggml_fp16_t d; // delta
780
+ ggml_fp16_t m; // min
781
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
782
  } block_q4_1;
783
+ static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
784
 
785
  #define QK5_0 32
786
  typedef struct {
 
801
 
802
  #define QK8_0 32
803
  typedef struct {
804
+ ggml_fp16_t d; // delta
805
+ int8_t qs[QK8_0]; // quants
806
  } block_q8_0;
807
+ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
808
 
809
  #define QK8_1 32
810
  typedef struct {
811
+ float d; // delta
812
+ float s; // d * sum(qs[i])
813
+ int8_t qs[QK8_1]; // quants
814
  } block_q8_1;
815
  static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
816
 
 
837
  const float d = max / -8;
838
  const float id = d ? 1.0f/d : 0.0f;
839
 
840
+ y[i].d = GGML_FP32_TO_FP16(d);
841
 
842
  for (int j = 0; j < qk/2; ++j) {
843
  const float x0 = x[i*qk + 0 + j]*id;
 
877
  const float d = (max - min) / ((1 << 4) - 1);
878
  const float id = d ? 1.0f/d : 0.0f;
879
 
880
+ y[i].d = GGML_FP32_TO_FP16(d);
881
+ y[i].m = GGML_FP32_TO_FP16(min);
882
 
883
  for (int j = 0; j < qk/2; ++j) {
884
  const float x0 = (x[i*qk + 0 + j] - min)*id;
 
1009
  const float d = amax / ((1 << 7) - 1);
1010
  const float id = d ? 1.0f/d : 0.0f;
1011
 
1012
+ y[i].d = GGML_FP32_TO_FP16(d);
1013
 
1014
  for (int j = 0; j < QK8_0; ++j) {
1015
  const float x0 = x[i*QK8_0 + j]*id;
 
1044
  const float d = amax / ((1 << 7) - 1);
1045
  const float id = d ? 1.0f/d : 0.0f;
1046
 
1047
+ y[i].d = GGML_FP32_TO_FP16(d);
1048
 
1049
  for (int j = 0; j < 8; j++) {
1050
  const float32x4_t v = vmulq_n_f32(srcv[j], id);
 
1079
 
1080
  // Quantize these floats
1081
  const float d = maxScalar / 127.f;
1082
+ y[i].d = GGML_FP32_TO_FP16(d);
1083
  const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
1084
  const __m256 mul = _mm256_set1_ps( id );
1085
 
 
1178
  sum += y[i].qs[QK8_1/2 + j];
1179
  }
1180
 
1181
+ y[i].s = sum*d;
1182
  }
1183
  }
1184
 
 
1330
  const int nb = k / qk;
1331
 
1332
  for (int i = 0; i < nb; i++) {
1333
+ const float d = GGML_FP16_TO_FP32(x[i].d);
1334
 
1335
  for (int j = 0; j < qk/2; ++j) {
1336
  const int x0 = (x[i].qs[j] & 0x0F) - 8;
 
1350
  const int nb = k / qk;
1351
 
1352
  for (int i = 0; i < nb; i++) {
1353
+ const float d = GGML_FP16_TO_FP32(x[i].d);
1354
+ const float m = GGML_FP16_TO_FP32(x[i].m);
1355
 
1356
  for (int j = 0; j < qk/2; ++j) {
1357
  const int x0 = (x[i].qs[j] & 0x0F);
 
1426
  const block_q8_0 * restrict x = vx;
1427
 
1428
  for (int i = 0; i < nb; i++) {
1429
+ const float d = GGML_FP16_TO_FP32(x[i].d);
1430
 
1431
  for (int j = 0; j < qk; ++j) {
1432
  y[i*qk + j] = x[i].qs[j]*d;
 
1690
  static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
1691
  float tmp[8];
1692
 
1693
+ for (int i = 0; i < 8; i++) {
1694
  tmp[i] = GGML_FP16_TO_FP32(x[i]);
1695
+ }
1696
 
1697
  return _mm256_loadu_ps(tmp);
1698
  }
 
2112
  const block_q8_0 * restrict y0 = &y[i + 0];
2113
  const block_q8_0 * restrict y1 = &y[i + 1];
2114
 
2115
+ const uint8x16_t m4b = vdupq_n_u8(0x0F);
2116
+ const int8x16_t s8b = vdupq_n_s8(0x8);
2117
 
2118
  const uint8x16_t v0_0 = vld1q_u8(x0->qs);
2119
  const uint8x16_t v0_1 = vld1q_u8(x1->qs);
 
2141
  const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
2142
  const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
2143
 
2144
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2145
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2146
  #else
2147
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
2148
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
 
2159
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2160
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2161
 
2162
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2163
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2164
  #endif
2165
  }
2166
 
 
2172
  // Main loop
2173
  for (int i = 0; i < nb; ++i) {
2174
  /* Compute combined scale for the block */
2175
+ const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
2176
 
2177
  __m256i bx = bytes_from_nibbles_32(x[i].qs);
2178
 
 
2196
  // Main loop
2197
  for (int i = 0; i < nb; ++i) {
2198
  // Compute combined scale for the block
2199
+ const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
2200
 
2201
  const __m128i lowMask = _mm_set1_epi8(0xF);
2202
  const __m128i off = _mm_set1_epi8(8);
 
2238
  _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);
2239
 
2240
  // Compute combined scale for the block 0 and 1
2241
+ const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );
2242
 
2243
  const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
2244
 
 
2256
  _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
2257
 
2258
  // Compute combined scale for the block 2 and 3
2259
+ const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );
2260
 
2261
  const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
2262
 
 
2289
  _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
2290
 
2291
  // Compute combined scale for the block 0 and 1
2292
+ const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
2293
 
2294
  const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
2295
 
 
2307
  _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
2308
 
2309
  // Compute combined scale for the block 2 and 3
2310
+ const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );
2311
 
2312
  const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
2313
 
 
2355
  sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
2356
  }
2357
 
2358
+ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
2359
  }
2360
 
2361
  *s = sumf;
 
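Note that the scalar fallback follows the same two-stage pattern as the SIMD paths: accumulate the integer products for a whole block, then multiply once by the combined (now FP16) scales. Pieced together from the hunk above (sketch):

    // scalar q4_0 x q8_0 dot product: one float multiply per block
    float sumf = 0.0f;
    for (int i = 0; i < nb; i++) {
        int sumi = 0;
        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[i].qs[j] & 0x0F) - 8; // low nibble
            const int v1 = (x[i].qs[j] >>   4) - 8; // high nibble

            sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
        }
        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
    }
    *s = sumf;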
2385
  const block_q8_1 * restrict y0 = &y[i + 0];
2386
  const block_q8_1 * restrict y1 = &y[i + 1];
2387
 
2388
+ summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
2389
 
2390
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
2391
 
 
2409
  const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
2410
  const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
2411
 
2412
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
2413
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
2414
  #else
2415
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
2416
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
 
2427
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2428
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2429
 
2430
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
2431
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
2432
  #endif
2433
  }
2434
 
 
2441
 
2442
  // Main loop
2443
  for (int i = 0; i < nb; ++i) {
2444
+ const float d0 = GGML_FP16_TO_FP32(x[i].d);
2445
+ const float d1 = y[i].d;
2446
 
2447
+ summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
2448
 
2449
+ const __m256 d0v = _mm256_set1_ps( d0 );
2450
+ const __m256 d1v = _mm256_set1_ps( d1 );
2451
 
2452
  // Compute combined scales
2453
  const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
 
2456
  const __m256i bx = bytes_from_nibbles_32(x[i].qs);
2457
  const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
2458
 
2459
+ const __m256 xy = mul_sum_us8_pairs_float(bx, by);
2460
 
2461
  // Accumulate d0*d1*x*y
2462
  #if defined(__AVX2__)
 
2481
  sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
2482
  }
2483
 
2484
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
2485
  }
2486
 
2487
  *s = sumf;
 
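The extra GGML_FP16_TO_FP32(x[i].m)*y[i].s term is exactly what the cached s = d_y * sum(q'_j) in block_q8_1 buys: each q4_1 weight is d_x*q_j + m, so

    sum_j (d_x*q_j + m) * d_y*q'_j = d_x*d_y * sum_j q_j*q'_j + m * (d_y * sum_j q'_j)
                                   = (d_x*d_y)*sumi + m*y.s

i.e. the per-block min offset is folded in with a single multiply-add instead of a per-element addition.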
2557
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2558
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2559
 
 
 
 
2560
  #if defined(__ARM_FEATURE_DOTPROD)
2561
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
2562
  vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
2563
+ vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2564
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
2565
  vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
2566
+ vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2567
  #else
2568
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
2569
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
 
2580
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2581
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2582
 
2583
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2584
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2585
  #endif
2586
  }
2587
 
 
2656
  // Main loop
2657
  for (int i = 0; i < nb; i++) {
2658
  /* Compute combined scale for the block */
2659
+ const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
2660
 
2661
  __m256i bx = bytes_from_nibbles_32(x[i].qs);
2662
  __m256i bxhi = bytes_from_bits_32(x[i].qh);
 
2680
  // Main loop
2681
  for (int i = 0; i < nb; i++) {
2682
  /* Compute combined scale for the block */
2683
+ const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
2684
 
2685
  __m256i bx = bytes_from_nibbles_32(x[i].qs);
2686
  const __m256i bxhi = bytes_from_bits_32(x[i].qh);
 
2723
  sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
2724
  }
2725
 
2726
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
2727
  }
2728
 
2729
  *s = sumf;
 
2805
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2806
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2807
 
 
 
 
2808
  #if defined(__ARM_FEATURE_DOTPROD)
2809
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
2810
  vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
2811
+ vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
2812
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
2813
  vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
2814
+ vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
2815
  #else
2816
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
2817
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
 
2828
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2829
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2830
 
2831
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
2832
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
2833
  #endif
2834
  }
2835
 
 
2889
  const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
2890
  const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
2891
 
 
 
2892
  // dot product
2893
+ sumv = wasm_f32x4_add(sumv,
2894
+ wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
2895
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
2896
+ wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
2897
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
2898
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
2899
+ wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d));
2900
  }
2901
 
2902
  *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
 
2918
  bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
2919
  bx = _mm256_or_si256(bx, bxhi);
2920
 
2921
+ const __m256 dy = _mm256_set1_ps(y[i].d);
2922
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
2923
 
2924
+ const __m256 q = mul_sum_us8_pairs_float(bx, by);
2925
 
2926
  acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
2927
  }
 
2952
  bxh = _mm_or_si128(bxh, bxhih);
2953
  bx = _mm256_set_m128i(bxh, bxl);
2954
 
2955
+ const __m256 dy = _mm256_set1_ps(y[i].d);
2956
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
2957
 
2958
+ const __m256 q = mul_sum_us8_pairs_float(bx, by);
2959
 
2960
  acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
2961
  }
 
3022
  #if defined(__ARM_FEATURE_DOTPROD)
3023
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
3024
  vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
3025
+ vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
3026
 
3027
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
3028
  vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
3029
+ vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
3030
 
3031
  #else
3032
  const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
 
3044
  const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
3045
  const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
3046
 
3047
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
3048
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
3049
  #endif
3050
  }
3051
 
 
3057
  // Main loop
3058
  for (int i = 0; i < nb; ++i) {
3059
  // Compute combined scale for the block
3060
+ const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
3061
  __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
3062
  __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
3063
 
 
3083
  sumi += x[i].qs[j]*y[i].qs[j];
3084
  }
3085
 
3086
+ sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
3087
  }
3088
 
3089
  *s = sumf;
 
3472
  "ROPE",
3473
  "ROPE_BACK",
3474
  "ALIBI",
3475
+ "CLAMP",
3476
  "CONV_1D_1S",
3477
  "CONV_1D_2S",
3478
 
 
3483
  "MAP_BINARY",
3484
  };
3485
 
3486
+ static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
3487
+
3488
 
3489
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3490
  "none",
 
3534
  "rope(x)",
3535
  "rope_back(x)",
3536
  "alibi(x)",
3537
+ "clamp(x)",
3538
  "conv_1d_1s(x)",
3539
  "conv_1d_2s(x)",
3540
 
 
3545
  "f(x,y)",
3546
  };
3547
 
3548
+ static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
3549
 
3550
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3551
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
3779
  (t1->ne[3]%t0->ne[3] == 0);
3780
  }
3781
 
3782
+ static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
3783
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3784
+
3785
+ return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
3786
+ }
3787
+
3788
  static inline int ggml_up32(int n) {
3789
  return (n + 31) & ~31;
3790
  }
 
4667
  struct ggml_tensor * a,
4668
  struct ggml_tensor * b,
4669
  bool inplace) {
4670
+ // TODO: support less-strict constraint
4671
+ // GGML_ASSERT(ggml_can_repeat(b, a));
4672
+ GGML_ASSERT(ggml_can_repeat_rows(b, a));
4673
 
4674
  bool is_node = false;
4675
 
4676
  if (!inplace && (a->grad || b->grad)) {
4677
+ // TODO: support backward pass for broadcasting
4678
+ GGML_ASSERT(ggml_are_same_shape(a, b));
4679
  is_node = true;
4680
  }
4681
 
 
6217
  struct ggml_context * ctx,
6218
  struct ggml_tensor * a,
6219
  int n_past,
6220
+ int n_head,
6221
+ float bias_max) {
6222
  GGML_ASSERT(n_past >= 0);
6223
  bool is_node = false;
6224
 
 
6237
 
6238
  ((int32_t *) b->data)[0] = n_past;
6239
  ((int32_t *) b->data)[1] = n_head;
6240
+ GGML_ASSERT(sizeof(float) == sizeof(int32_t));
6241
+ (((float *) b->data)[2]) = bias_max;
6242
 
6243
  ggml_scratch_load(ctx);
6244
 
 
6250
  return result;
6251
  }
6252
 
6253
+ // ggml_clamp
6254
+
6255
+ struct ggml_tensor * ggml_clamp(
6256
+ struct ggml_context * ctx,
6257
+ struct ggml_tensor * a,
6258
+ float min,
6259
+ float max) {
6260
+ bool is_node = false;
6261
+
6262
+ if (a->grad) {
6263
+ GGML_ASSERT(false); // TODO: implement backward
6264
+ is_node = true;
6265
+ }
6266
+
6267
+ // TODO: when implementing backward, fix this:
6268
+ struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6269
+
6270
+ ggml_scratch_save(ctx);
6271
+
6272
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6273
+
6274
+ ((float *) b->data)[0] = min;
6275
+ ((float *) b->data)[1] = max;
6276
+
6277
+ ggml_scratch_load(ctx);
6278
+
6279
+ result->op = GGML_OP_CLAMP;
6280
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6281
+ result->src0 = a;
6282
+ result->src1 = b;
6283
+
6284
+ return result;
6285
+ }
6286
+
6287
  // ggml_conv_1d_1s
6288
 
6289
  struct ggml_tensor * ggml_conv_1d_1s(
 
8010
  const struct ggml_tensor * src0,
8011
  const struct ggml_tensor * src1,
8012
  struct ggml_tensor * dst) {
8013
+ GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
8014
 
8015
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8016
  return;
 
8018
  const int ith = params->ith;
8019
  const int nth = params->nth;
8020
 
8021
+ #ifdef GGML_USE_CUBLAS
8022
+ if (src1->backend == GGML_BACKEND_CUDA) {
8023
+ if (ith == 0) {
8024
+ ggml_cuda_mul(src0, src1, dst);
8025
+ }
8026
+ return;
8027
+ }
8028
+ #endif
8029
+
8030
+ const int64_t nr = ggml_nrows(src0);
8031
+
8032
+ const int64_t ne00 = src0->ne[0];
8033
+ const int64_t ne01 = src0->ne[1];
8034
+ const int64_t ne02 = src0->ne[2];
8035
+
8036
+ const int64_t ne10 = src1->ne[0];
8037
+ const int64_t ne11 = src1->ne[1];
8038
+ const int64_t ne12 = src1->ne[2];
8039
+ const int64_t ne13 = src1->ne[3];
8040
 
8041
  const size_t nb00 = src0->nb[0];
8042
  const size_t nb01 = src0->nb[1];
 
8055
 
8056
  GGML_ASSERT( nb0 == sizeof(float));
8057
  GGML_ASSERT(nb00 == sizeof(float));
8058
+ GGML_ASSERT(ne00 == ne10);
8059
 
8060
  if (nb10 == sizeof(float)) {
8061
+ for (int64_t ir = ith; ir < nr; ir += nth) {
8062
+ // src0 and dst are same shape => same indices
8063
+ const int64_t i03 = ir/(ne02*ne01);
8064
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
8065
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
8066
+
8067
+ const int64_t i13 = i03 % ne13;
8068
+ const int64_t i12 = i02 % ne12;
8069
+ const int64_t i11 = i01 % ne11;
8070
 
8071
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
8072
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
8073
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
8074
 
8075
  #ifdef GGML_USE_ACCELERATE
8076
  UNUSED(ggml_vec_mul_f32);
8077
 
8078
+ vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
 
 
 
 
8079
  #else
8080
+ ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
 
 
 
8081
  #endif
8082
  // }
8083
  // }
8084
  }
8085
  } else {
8086
  // src1 is not contiguous
8087
+ for (int64_t ir = ith; ir < nr; ir += nth) {
8088
+ // src0 and dst are same shape => same indices
8089
+ // src1 is broadcastable across src0 and dst in i1, i2, i3
8090
+ const int64_t i03 = ir/(ne02*ne01);
8091
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
8092
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
8093
 
8094
+ const int64_t i13 = i03 % ne13;
8095
+ const int64_t i12 = i02 % ne12;
8096
+ const int64_t i11 = i01 % ne11;
8097
+
8098
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
8099
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
8100
+
8101
+ for (int64_t i0 = 0; i0 < ne00; i0++) {
8102
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
8103
 
8104
  dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
8105
  }
 
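This relaxed check (ggml_can_repeat_rows instead of ggml_are_same_shape) means src1 rows are reused modulo its extents in dims 1..3, while dim 0 must match exactly. The practical payoff is per-row scaling without an explicit ggml_repeat, e.g. (hypothetical shapes; ctx, n_embd and n_tokens are assumed to exist):

    // broadcast a single row of weights across all rows of x
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    struct ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
    struct ggml_tensor * y = ggml_mul(ctx, x, w); // previously required ggml_repeat(w, x)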
10588
  assert(src1->type == GGML_TYPE_I32);
10589
  assert(ggml_nelements(src1) == 2);
10590
 
10591
+ const int ith = params->ith;
10592
+ const int nth = params->nth;
10593
+
10594
  const int n_past = ((int32_t *) src1->data)[0];
10595
  const bool inplace = (bool)((int32_t *) src1->data)[1];
10596
 
10597
+ assert(n_past >= 0);

10598
 
10599
+ if (!inplace && (params->type == GGML_TASK_INIT)) {
10600
+ // memcpy needs to be synchronized across threads to avoid race conditions.
10601
+ // => do it in INIT phase
10602
+ GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
10603
+ GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
10604
+ memcpy(
10605
+ ((char *) dst->data),
10606
+ ((char *) src0->data),
10607
+ ggml_nbytes(dst));
10608
  }
10609
 
10610
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10611
  return;
10612
  }
10613
 
 
 
 
 
 
10614
  // TODO: handle transposed/permuted matrices
10615
 
10616
  const int n = ggml_nrows(src0);
 
10764
  struct ggml_tensor * dst) {
10765
  assert(params->ith == 0);
10766
  assert(src1->type == GGML_TYPE_I32);
10767
+ assert(ggml_nelements(src1) == 3);
10768
 
10769
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10770
  return;
10771
  }
10772
 
10773
+ const int n_past = ((int32_t *) src1->data)[0];
10774
+ const int n_head = ((int32_t *) src1->data)[1];
10775
+ const float max_bias = ((float *) src1->data)[2];
10776
 
10777
  assert(n_past >= 0);
10778
 
 
10795
  // add alibi to src0 (KQ_scaled)
10796
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
10797
 
10798
+ const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
10799
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
10800
 
10801
  for (int i = 0; i < ne0; i++) {
10802
  for (int j = 0; j < ne1; j++) {
 
10814
  m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
10815
  }
10816
 
10817
+ pdst[0] = (i-ne0+1) * m_k + src[0];
10818
+
10819
  }
10820
  }
10821
  }
10822
  }
10823
 
 
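The m0/m1 pair generalizes the standard ALiBi slopes: with max_bias = 8 you recover the usual 2^(-8*(k+1)/n_head) schedule, and the m1 branch interpolates for head counts that are not a power of two. A standalone sketch of the per-head slope, mirroring the loop above:

    #include <math.h>

    // ALiBi slope for head k (0-based); sketch mirroring ggml_compute_forward_alibi_*
    static float alibi_slope(int k, int n_head, float max_bias) {
        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

        const float m0 = powf(2.0f, -(max_bias)        / n_heads_log2_floor);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

        if (k < n_heads_log2_floor) {
            return powf(m0, k + 1);
        }
        return powf(m1, 2*(k - n_heads_log2_floor) + 1);
    }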
10824
  static void ggml_compute_forward_alibi_f16(
10825
  const struct ggml_compute_params * params,
10826
  const struct ggml_tensor * src0,
 
10828
  struct ggml_tensor * dst) {
10829
  assert(params->ith == 0);
10830
  assert(src1->type == GGML_TYPE_I32);
10831
+ assert(ggml_nelements(src1) == 3);
10832
 
10833
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10834
  return;
10835
  }
10836
 
10837
+ const int n_past = ((int32_t *) src1->data)[0];
10838
+ const int n_head = ((int32_t *) src1->data)[1];
10839
+ const float max_bias = ((float *) src1->data)[2];
10840
 
10841
  assert(n_past >= 0);
10842
 
 
10859
  // add alibi to src0 (KQ_scaled)
10860
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
10861
 
10862
+ const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
10863
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
10864
 
10865
  for (int i = 0; i < ne0; i++) {
10866
  for (int j = 0; j < ne1; j++) {
 
10879
  }
10880
 
10881
  // we return F32
10882
+ pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
10883
  }
10884
  }
10885
  }
 
10915
  }
10916
  }
10917
 
10918
+
10919
+ // ggml_compute_forward_clamp
10920
+
10921
+ static void ggml_compute_forward_clamp_f32(
10922
+ const struct ggml_compute_params * params,
10923
+ const struct ggml_tensor * src0,
10924
+ const struct ggml_tensor * src1,
10925
+ struct ggml_tensor * dst) {
10926
+ assert(params->ith == 0);
10927
+ assert(src1->type == GGML_TYPE_I32);
10928
+ assert(ggml_nelements(src1) == 2);
10929
+
10930
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10931
+ return;
10932
+ }
10933
+
10934
+ const float min = ((float *) src1->data)[0];
10935
+ const float max = ((float *) src1->data)[1];
10936
+
10937
+ const int ith = params->ith;
10938
+ const int nth = params->nth;
10939
+
10940
+ const int n = ggml_nrows(src0);
10941
+ const int nc = src0->ne[0];
10942
+
10943
+ const size_t nb00 = src0->nb[0];
10944
+ const size_t nb01 = src0->nb[1];
10945
+
10946
+ const size_t nb0 = dst->nb[0];
10947
+ const size_t nb1 = dst->nb[1];
10948
+
10949
+ GGML_ASSERT( nb0 == sizeof(float));
10950
+ GGML_ASSERT(nb00 == sizeof(float));
10951
+
10952
+ for (int j = ith; j < n; j += nth) {
10953
+ float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
10954
+ float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
10955
+
10956
+ for (int i = 0; i < nc; i++) {
10957
+ dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
10958
+ }
10959
+ }
10960
+ }
10961
+
10962
+ static void ggml_compute_forward_clamp(
10963
+ const struct ggml_compute_params * params,
10964
+ const struct ggml_tensor * src0,
10965
+ const struct ggml_tensor * src1,
10966
+ struct ggml_tensor * dst) {
10967
+ switch (src0->type) {
10968
+ case GGML_TYPE_F32:
10969
+ {
10970
+ ggml_compute_forward_clamp_f32(params, src0, src1, dst);
10971
+ } break;
10972
+ case GGML_TYPE_F16:
10973
+ case GGML_TYPE_Q4_0:
10974
+ case GGML_TYPE_Q4_1:
10975
+ case GGML_TYPE_Q5_0:
10976
+ case GGML_TYPE_Q5_1:
10977
+ case GGML_TYPE_Q8_0:
10978
+ case GGML_TYPE_Q8_1:
10979
+ case GGML_TYPE_I8:
10980
+ case GGML_TYPE_I16:
10981
+ case GGML_TYPE_I32:
10982
+ case GGML_TYPE_COUNT:
10983
+ {
10984
+ GGML_ASSERT(false);
10985
+ } break;
10986
+ }
10987
+ }
10988
+
10989
  // ggml_compute_forward_rope
10990
 
10991
  static void ggml_compute_forward_rope_f32(
 
12967
  {
12968
  ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
12969
  } break;
12970
+ case GGML_OP_CLAMP:
12971
+ {
12972
+ ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
12973
+ } break;
12974
  case GGML_OP_CONV_1D_1S:
12975
  {
12976
  ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
 
13278
  {
13279
  GGML_ASSERT(false); // TODO: not implemented
13280
  } break;
13281
+ case GGML_OP_CLAMP:
13282
+ {
13283
+ GGML_ASSERT(false); // TODO: not implemented
13284
+ } break;
13285
  case GGML_OP_SILU:
13286
  {
13287
  // necessary for llama
 
14161
  {
14162
  node->n_tasks = 1; //TODO
14163
  } break;
14164
+ case GGML_OP_CLAMP:
14165
+ {
14166
+ node->n_tasks = 1; //TODO
14167
+ } break;
14168
  case GGML_OP_CONV_1D_1S:
14169
  case GGML_OP_CONV_1D_2S:
14170
  {
ggml.h CHANGED
@@ -190,7 +190,7 @@
190
  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
191
  #define GGML_FILE_VERSION 1
192
 
193
- #define GGML_QNT_VERSION 1 // bump this on quantization format changes
194
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
195
 
196
  #define GGML_MAX_DIMS 4
@@ -313,6 +313,7 @@ extern "C" {
313
  GGML_OP_ROPE,
314
  GGML_OP_ROPE_BACK,
315
  GGML_OP_ALIBI,
 
316
  GGML_OP_CONV_1D_1S,
317
  GGML_OP_CONV_1D_2S,
318
 
@@ -849,7 +850,7 @@ extern "C" {
849
  int n_past);
850
 
851
  // in-place, returns view(a)
852
- GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
853
  struct ggml_context * ctx,
854
  struct ggml_tensor * a,
855
  int n_past);
@@ -897,7 +898,16 @@ extern "C" {
897
  struct ggml_context * ctx,
898
  struct ggml_tensor * a,
899
  int n_past,
900
- int n_head);

901
 
902
  // padding = 1
903
  // TODO: we don't support extra parameters for now
 
190
  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
191
  #define GGML_FILE_VERSION 1
192
 
193
+ #define GGML_QNT_VERSION 2 // bump this on quantization format changes
194
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
195
 
196
  #define GGML_MAX_DIMS 4
 
313
  GGML_OP_ROPE,
314
  GGML_OP_ROPE_BACK,
315
  GGML_OP_ALIBI,
316
+ GGML_OP_CLAMP,
317
  GGML_OP_CONV_1D_1S,
318
  GGML_OP_CONV_1D_2S,
319
 
 
850
  int n_past);
851
 
852
  // in-place, returns view(a)
853
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
854
  struct ggml_context * ctx,
855
  struct ggml_tensor * a,
856
  int n_past);
 
898
  struct ggml_context * ctx,
899
  struct ggml_tensor * a,
900
  int n_past,
901
+ int n_head,
902
+ float bias_max);
903
+
904
+ // clamp
905
+ // in-place, returns view(a)
906
+ struct ggml_tensor * ggml_clamp(
907
+ struct ggml_context * ctx,
908
+ struct ggml_tensor * a,
909
+ float min,
910
+ float max);
911
 
912
  // padding = 1
913
  // TODO: we don't support extra parameters for now