ggml : sync latest ggml repo
- new Q4 and Q8 quantization
- updated CUDA
- examples/common.cpp +2 -1
- ggml-cuda.cu +159 -132
- ggml-cuda.h +2 -0
- ggml.c +322 -155
- ggml.h +13 -3
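
Before the per-file diffs, a minimal C sketch of what "new Q4 and Q8 quantization" means for the q4_0 format, based on the struct and dequantize changes in the ggml.c hunks below: the per-block scale becomes a 16-bit float and each byte still packs two 4-bit quants. The names block_q4_0_sketch, dequant_q4_0_block and fp16_to_fp32 are illustrative stand-ins, not identifiers from the patch.

#include <stdint.h>

#define QK4_0 32

/* New layout from the diff: fp16 delta + 16 packed nibble bytes = 18 bytes per block. */
typedef struct {
    uint16_t d;                /* ggml_fp16_t delta (previously a 32-bit float) */
    uint8_t  qs[QK4_0 / 2];    /* nibbles / quants */
} block_q4_0_sketch;

/* Scalar dequantization of one block, mirroring dequantize_row_q4_0 in the diff;
 * the caller supplies a half-to-float conversion in place of GGML_FP16_TO_FP32. */
static void dequant_q4_0_block(const block_q4_0_sketch * b,
                               float (*fp16_to_fp32)(uint16_t),
                               float * y) {
    const float d = fp16_to_fp32(b->d);
    for (int j = 0; j < QK4_0/2; ++j) {
        const int x0 = (b->qs[j] & 0x0F) - 8;  /* low nibble  */
        const int x1 = (b->qs[j] >>   4) - 8;  /* high nibble */
        y[j]           = x0*d;
        y[j + QK4_0/2] = x1*d;
    }
}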
examples/common.cpp
CHANGED
@@ -26,7 +26,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "-n" || arg == "--n_predict") {
             params.n_predict = std::stoi(argv[++i]);
         } else if (arg == "--top_k") {
-            params.top_k = std::stoi(argv[++i]);
+            params.top_k = std::max(1, std::stoi(argv[++i]));
         } else if (arg == "--top_p") {
             params.top_p = std::stof(argv[++i]);
         } else if (arg == "--temp") {
@@ -259,6 +259,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
             if (it != vocab.token_to_id.end()) {
                 tokens.push_back(it->second);
                 i = j;
+                j = n;
                 break;
             }
             --j;
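
A small C illustration (not part of the patch) of the behaviour change in the --top_k parsing above: clamping keeps the value at least 1, so passing 0 or a negative number presumably can no longer empty the candidate set. The patch itself uses std::max in C++; parse_top_k below is a hypothetical stand-in.

#include <assert.h>
#include <stdlib.h>

/* Hypothetical helper mirroring the new clamp-to-at-least-1 parsing. */
static int parse_top_k(const char * arg) {
    const int v = atoi(arg);
    return v < 1 ? 1 : v;
}

int main(void) {
    assert(parse_top_k("40") == 40);
    assert(parse_top_k("0")  == 1);  /* previously 0 would have been stored as-is */
    assert(parse_top_k("-5") == 1);
    return 0;
}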
ggml-cuda.cu
CHANGED
@@ -42,19 +42,19 @@ typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y,
 #define QK4_0 32
 #define QR4_0 2
 typedef struct {
-    float   d;              // delta
+    half    d;              // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants
 } block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
+static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");

 #define QK4_1 32
 #define QR4_1 2
 typedef struct {
-    float   d;              // delta
-    float   m;              // min
+    half    d;              // delta
+    half    m;              // min
     uint8_t qs[QK4_1 / 2];  // nibbles / quants
 } block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");

 #define QK5_0 32
 #define QR5_0 2
@@ -78,12 +78,23 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 #define QK8_0 32
 #define QR8_0 1
 typedef struct {
-    float   d;              // delta
+    half    d;              // delta
     int8_t  qs[QK8_0];      // quants
 } block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");

-#define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec
+#define CUDA_MUL_BLOCK_SIZE 256
+#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+#define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec
+
+static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= kx) {
+        return;
+    }
+    dst[i] = x[i] * y[i%ky];
+}

 static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
     const block_q4_0 * x = (const block_q4_0 *) vx;
@@ -170,104 +181,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v1 = __half2float(x[ib + 1]);
 }

-static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
-    static const int qk = QK4_0;
-
-    const block_q4_0 * x = (const block_q4_0 *) vx;
-
-    const int i = blockIdx.x;
-
-    const float d = x[i].d;
-
-    for (int j = 0; j < qk/2; ++j) {
-        const int x0 = (x[i].qs[j] & 0xf) - 8;
-        const int x1 = (x[i].qs[j] >>  4) - 8;
-
-        y[i*qk + j + 0   ] = x0*d;
-        y[i*qk + j + qk/2] = x1*d;
-    }
-}
-
-static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
-    static const int qk = QK4_1;
-
-    const block_q4_1 * x = (const block_q4_1 *) vx;
-
-    const int i = blockIdx.x;
-
-    const float d = x[i].d;
-    const float m = x[i].m;
-
-    for (int j = 0; j < qk/2; ++j) {
-        const int x0 = (x[i].qs[j] & 0xf);
-        const int x1 = (x[i].qs[j] >>  4);
-
-        y[i*qk + j + 0   ] = x0*d + m;
-        y[i*qk + j + qk/2] = x1*d + m;
-    }
-}
-
-static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
-    static const int qk = QK5_0;
-
-    const block_q5_0 * x = (const block_q5_0 *) vx;
-
-    const int i = blockIdx.x;
-
-    const float d = x[i].d;
-
-    uint32_t qh;
-    memcpy(&qh, x[i].qh, sizeof(qh));
-
-    for (int j = 0; j < qk/2; ++j) {
-        const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-        const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-        const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
-        const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;
-
-        y[i*qk + j + 0   ] = x0*d;
-        y[i*qk + j + qk/2] = x1*d;
-    }
-}
-
-static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
-    static const int qk = QK5_1;
-
-    const block_q5_1 * x = (const block_q5_1 *) vx;
-
-    const int i = blockIdx.x;
-
-    const float d = x[i].d;
-    const float m = x[i].m;
-
-    uint32_t qh;
-    memcpy(&qh, x[i].qh, sizeof(qh));
-
-    for (int j = 0; j < qk/2; ++j) {
-        const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-        const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-        const int x0 = (x[i].qs[j] & 0xf) | xh_0;
-        const int x1 = (x[i].qs[j] >>  4) | xh_1;
-
-        y[i*qk + j + 0   ] = x0*d + m;
-        y[i*qk + j + qk/2] = x1*d + m;
-    }
-}
-
-static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
-    static const int qk = QK8_0;
-
-    const block_q8_0 * x = (const block_q8_0 *) vx;
-
-    const int i = blockIdx.x;
-
-    const float d = x[i].d;
-
-    for (int j = 0; j < qk; ++j) {
-        y[i*qk + j] = x[i].qs[j]*d;
-    }
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static __global__ void dequantize_block(const void * vx, float * y, const int k) {
+    const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    const int ib = i/qk; // block index
+    const int iqs = (i%qk)/qr; // quant index
+    const int iybs = i - i%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    float & v0 = y[iybs + iqs + 0];
+    float & v1 = y[iybs + iqs + y_offset];
+    dequantize_kernel(vx, ib, iqs, v0, v1);
 }

 template <int block_size, int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -308,29 +238,34 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
     }
 }

-static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-    const int nb = k / QK4_0;
-    dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
+static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+    const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
+    mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
+}
+
+static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-    const int nb = k / QK4_1;
-    dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
+static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-    const int nb = k / QK5_0;
-    dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
+static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-    const int nb = k / QK5_1;
-    dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y);
+static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-    const int nb = k / QK8_0;
-    dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
+static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@@ -363,17 +298,9 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
         <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
 }

-static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
-    const half * x = (const half *) vx;
-
-    const int i = blockIdx.x;
-
-    y[i] = __half2float(x[i]);
-}
-
-static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStream_t stream) {
-    convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
+static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    dequantize_block<32, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

 static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@@ -555,6 +482,67 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor
     }
 }

+static void ggml_cuda_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->backend == GGML_BACKEND_CUDA);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[2];
+    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+    size_t x_size, d_size;
+
+    float * d_X = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+    float * d_Y = (float *) src1->data; // src1 is already on device, broadcasted.
+    float * d_D = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const int i0 = i03*ne02 + i02;
+            float * c_X2 = d_X + i0*ne01*ne00;
+            float * c_D2 = d_D + i0*ne01*ne00;
+
+            cudaStream_t cudaStream = g_cudaStreams[i0 % GGML_CUDA_MAX_STREAMS];
+            cudaStream_t cudaStream2 = g_cudaStreams2[i0 % GGML_CUDA_MAX_STREAMS];
+            cudaEvent_t  cudaEvent = g_cudaEvents[i0 % GGML_CUDA_MAX_EVENTS];
+
+            // copy src0 to device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X2, src0, i03, i02, cudaStream2));
+            CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
+
+            // wait for data
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
+
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                const int64_t i13 = i03%ne13;
+                const int64_t i12 = i02%ne12;
+                const int64_t i11 = i01%ne11;
+                const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
+
+                float * c_X1 = c_X2 + i01*ne00;
+                float * c_Y = d_Y + i1*ne10;
+                float * c_D1 = c_D2 + i01*ne00;
+
+                // compute
+                mul_f32_cuda(c_X1, c_Y, c_D1, ne00, ne10, cudaStream);
+                CUDA_CHECK(cudaGetLastError());
+            }
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D2, sizeof(float)*ne00*ne01, cudaMemcpyDeviceToHost, cudaStream));
+        }
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+    ggml_cuda_pool_free(d_X, x_size);
+    ggml_cuda_pool_free(d_D, d_size);
+}
+
 static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
@@ -812,6 +800,11 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
     ggml_cuda_pool_free(d_Q, q_size);
 }

+void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_mul_f32(src0, src1, dst);
+}
+
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
     const int64_t ne10 = src1->ne[0];
@@ -885,14 +878,48 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);

     size_t q_size;
-    char *
+    char * dst = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);

     cudaStream_t cudaStream2 = g_cudaStreams2[0];

     // copy tensor to device
-
-
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            int i = i3*ne2 + i2;
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(dst + i*ne0*ne1, tensor, i3, i2, cudaStream2));
+        }
+    }

-    tensor->data =
+    tensor->data = dst;
     tensor->backend = GGML_BACKEND_CUDA;
 }
+
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
+    FILE * fp = fopen(fname, "rb");
+
+    const size_t size = ggml_nbytes(tensor);
+
+    void * buf;
+    CUDA_CHECK(cudaMalloc(&buf, size));
+    void * buf_host = malloc(size);
+
+#ifdef _WIN32
+    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
+#else
+    int ret = fseek(fp, (long) offset, SEEK_SET);
+#endif
+    GGML_ASSERT(ret == 0); // same
+
+    size_t ret2 = fread(buf_host, size, 1, fp);
+    if (ret2 != 1) {
+        fprintf(stderr, "unexpectedly reached end of file");
+        exit(1);
+    }
+
+    cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+    cudaDeviceSynchronize();
+
+    tensor->data = buf;
+    free(buf_host);
+    fclose(fp);
+}
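
To make the indexing of the new templated dequantize_block kernel above easier to follow, here is a host-side C check of the same mapping (flat element index to block index, quant index and second-output offset). The constants mirror QK4_0/QR4_0 from the diff; the program is only a sketch and not part of the patch.

#include <assert.h>
#include <stdio.h>

int main(void) {
    const int qk = 32;                 /* QK4_0: values per block             */
    const int qr = 2;                  /* QR4_0: values packed per quant byte */
    const int k  = 4096;               /* element count, multiple of qk       */

    const int y_offset = (qr == 1) ? 1 : qk/2;

    for (int i = 0; i < k; i += 2) {   /* with qr == 2, each thread covers two outputs */
        const int ib   = i/qk;         /* block index         */
        const int iqs  = (i%qk)/qr;    /* quant index         */
        const int iybs = i - i%qk;     /* y block start index */

        /* both outputs written for this index stay inside the 32-wide block */
        assert(iybs + iqs + 0        < (ib + 1)*qk);
        assert(iybs + iqs + y_offset < (ib + 1)*qk);
    }
    printf("dequantize_block index mapping stays within each %d-wide block\n", qk);
    return 0;
}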
ggml-cuda.h
CHANGED
@@ -6,6 +6,7 @@ extern "C" {

 void ggml_init_cublas(void);

+void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
@@ -15,6 +16,7 @@ void * ggml_cuda_host_malloc(size_t size);
 void ggml_cuda_host_free(void * ptr);

 void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);

 #ifdef __cplusplus
 }
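
For context, a hedged sketch of how a caller might route an element-wise multiply through the new ggml_cuda_mul entry point declared above. The backend check follows the GGML_BACKEND_CUDA convention used in the ggml-cuda.cu changes; the GGML_USE_CUBLAS guard and the helper name mul_maybe_cuda are assumptions for illustration, not code from this patch.

#include "ggml.h"
#include "ggml-cuda.h"

/* Hypothetical dispatch helper: use the CUDA path when src1 already lives on
 * the device (as after ggml_cuda_transform_tensor / ggml_cuda_load_data). */
static void mul_maybe_cuda(const struct ggml_tensor * src0,
                           const struct ggml_tensor * src1,
                           struct ggml_tensor * dst) {
#ifdef GGML_USE_CUBLAS
    if (src1->backend == GGML_BACKEND_CUDA) {
        ggml_cuda_mul(src0, src1, dst);
        return;
    }
#endif
    /* CPU fallback elided in this sketch */
    (void) src0; (void) src1; (void) dst;
}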
ggml.c
CHANGED
@@ -512,7 +512,7 @@ static inline int hsum_i32_4(const __m128i a) {
     return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
 }

-#if __AVX2__ || __AVX512F__
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     uint32_t x32;
@@ -543,12 +543,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
     return _mm256_cvtepi32_ps(summed_pairs);
 }

-
-static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-    // Get absolute values of x vectors
-    const __m256i ax = _mm256_sign_epi8(x, x);
-    // Sign the values of the y vectors
-    const __m256i sy = _mm256_sign_epi8(y, x);
 #if __AVXVNNI__
     const __m256i zero = _mm256_setzero_si256();
     const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
@@ -560,6 +555,21 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
 #endif
 }

 static inline __m128i packNibbles( __m256i bytes )
 {
     // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
@@ -619,6 +629,17 @@ static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
     return _mm256_cvtepi32_ps(summed_pairs);
 }

 // multiply int8_t, add results pairwise twice and return as float vector
 static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
     const __m128i xl = _mm256_castsi256_si128(x);
@@ -667,7 +688,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // __AVX__ || __AVX2__ || __AVX512F__
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

-#if __ARM_NEON

 #if !defined(__aarch64__)

@@ -748,18 +769,18 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {

 #define QK4_0 32
 typedef struct {
-    float   d;              // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants
 } block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");

 #define QK4_1 32
 typedef struct {
-    float   d;              // delta
-    float   m;              // min
     uint8_t qs[QK4_1 / 2];  // nibbles / quants
 } block_q4_1;
-static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding");

 #define QK5_0 32
 typedef struct {
@@ -780,16 +801,16 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

 #define QK8_0 32
 typedef struct {
-    float   d;              // delta
-    int8_t  qs[QK8_0];
 } block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");

 #define QK8_1 32
 typedef struct {
-    float
-    float
-    int8_t  qs[QK8_1];
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");

@@ -816,7 +837,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
     const float d  = max / -8;
     const float id = d ? 1.0f/d : 0.0f;

-    y[i].d = d;

     for (int j = 0; j < qk/2; ++j) {
         const float x0 = x[i*qk + 0 + j]*id;
@@ -856,8 +877,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
     const float d  = (max - min) / ((1 << 4) - 1);
     const float id = d ? 1.0f/d : 0.0f;

-    y[i].d = d;
-    y[i].m = min;

     for (int j = 0; j < qk/2; ++j) {
         const float x0 = (x[i*qk + 0 + j] - min)*id;
@@ -988,7 +1009,7 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
     const float d = amax / ((1 << 7) - 1);
     const float id = d ? 1.0f/d : 0.0f;

-    y[i].d = d;

     for (int j = 0; j < QK8_0; ++j) {
         const float x0 = x[i*QK8_0 + j]*id;
@@ -1023,7 +1044,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
     const float d = amax / ((1 << 7) - 1);
     const float id = d ? 1.0f/d : 0.0f;

-    y[i].d = d;

     for (int j = 0; j < 8; j++) {
         const float32x4_t v = vmulq_n_f32(srcv[j], id);
@@ -1058,7 +1079,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int

     // Quantize these floats
     const float d = maxScalar / 127.f;
-    y[i].d = d;
     const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
     const __m256 mul = _mm256_set1_ps( id );
@@ -1157,7 +1178,7 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r
         sum += y[i].qs[QK8_1/2 + j];
     }

-    y[i].s = d
     }
 }

@@ -1309,7 +1330,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
     const int nb = k / qk;

     for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;

         for (int j = 0; j < qk/2; ++j) {
             const int x0 = (x[i].qs[j] & 0x0F) - 8;
@@ -1329,8 +1350,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
     const int nb = k / qk;

     for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;
-        const float m = x[i].m;

         for (int j = 0; j < qk/2; ++j) {
             const int x0 = (x[i].qs[j] & 0x0F);
@@ -1405,7 +1426,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
     const block_q8_0 * restrict x = vx;

     for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;

         for (int j = 0; j < qk; ++j) {
             y[i*qk + j] = x[i].qs[j]*d;
@@ -1669,8 +1690,9 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
     float tmp[8];

-    for (int i = 0; i < 8; i++)
         tmp[i] = GGML_FP16_TO_FP32(x[i]);

     return _mm256_loadu_ps(tmp);
 }
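
The block sizes asserted in the hunks above pin down the storage cost per weight; the short C program below just spells out that arithmetic (q4_0: 18 bytes per 32 weights, q4_1: 20, q8_0: 34) and is illustrative only.

#include <stdio.h>

int main(void) {
    const int qk   = 32;           /* weights per block                  */
    const int q4_0 = 2 + qk/2;     /* fp16 d    + 16 nibble bytes = 18   */
    const int q4_1 = 2*2 + qk/2;   /* fp16 d, m + 16 nibble bytes = 20   */
    const int q8_0 = 2 + qk;       /* fp16 d    + 32 int8 quants  = 34   */

    printf("q4_0: %2d bytes/block = %.2f bits/weight\n", q4_0, 8.0*q4_0/qk);
    printf("q4_1: %2d bytes/block = %.2f bits/weight\n", q4_1, 8.0*q4_1/qk);
    printf("q8_0: %2d bytes/block = %.2f bits/weight\n", q8_0, 8.0*q8_0/qk);
    return 0;
}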
@@ -2090,8 +2112,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2090 |
const block_q8_0 * restrict y0 = &y[i + 0];
|
| 2091 |
const block_q8_0 * restrict y1 = &y[i + 1];
|
| 2092 |
|
| 2093 |
-
const uint8x16_t m4b
|
| 2094 |
-
const int8x16_t s8b
|
| 2095 |
|
| 2096 |
const uint8x16_t v0_0 = vld1q_u8(x0->qs);
|
| 2097 |
const uint8x16_t v0_1 = vld1q_u8(x1->qs);
|
|
@@ -2119,8 +2141,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2119 |
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
| 2120 |
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
| 2121 |
|
| 2122 |
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
|
| 2123 |
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
|
| 2124 |
#else
|
| 2125 |
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
|
| 2126 |
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
|
|
@@ -2137,8 +2159,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2137 |
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
| 2138 |
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
| 2139 |
|
| 2140 |
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
|
| 2141 |
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
|
| 2142 |
#endif
|
| 2143 |
}
|
| 2144 |
|
|
@@ -2150,7 +2172,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2150 |
// Main loop
|
| 2151 |
for (int i = 0; i < nb; ++i) {
|
| 2152 |
/* Compute combined scale for the block */
|
| 2153 |
-
const __m256 d =
|
| 2154 |
|
| 2155 |
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
| 2156 |
|
|
@@ -2174,7 +2196,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2174 |
// Main loop
|
| 2175 |
for (int i = 0; i < nb; ++i) {
|
| 2176 |
// Compute combined scale for the block
|
| 2177 |
-
const __m256 d =
|
| 2178 |
|
| 2179 |
const __m128i lowMask = _mm_set1_epi8(0xF);
|
| 2180 |
const __m128i off = _mm_set1_epi8(8);
|
|
@@ -2216,7 +2238,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2216 |
_mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);
|
| 2217 |
|
| 2218 |
// Compute combined scale for the block 0 and 1
|
| 2219 |
-
const __m128 d_0_1 =
|
| 2220 |
|
| 2221 |
const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
|
| 2222 |
|
|
@@ -2234,7 +2256,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2234 |
_mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
|
| 2235 |
|
| 2236 |
// Compute combined scale for the block 2 and 3
|
| 2237 |
-
const __m128 d_2_3 =
|
| 2238 |
|
| 2239 |
const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
|
| 2240 |
|
|
@@ -2267,7 +2289,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2267 |
_mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
|
| 2268 |
|
| 2269 |
// Compute combined scale for the block 0 and 1
|
| 2270 |
-
const __m128 d_0_1 =
|
| 2271 |
|
| 2272 |
const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
|
| 2273 |
|
|
@@ -2285,7 +2307,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2285 |
_mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
|
| 2286 |
|
| 2287 |
// Compute combined scale for the block 2 and 3
|
| 2288 |
-
const __m128 d_2_3 =
|
| 2289 |
|
| 2290 |
const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
|
| 2291 |
|
|
@@ -2333,7 +2355,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2333 |
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
| 2334 |
}
|
| 2335 |
|
| 2336 |
-
sumf += (x[i].d*y[i].d)
|
| 2337 |
}
|
| 2338 |
|
| 2339 |
*s = sumf;
|
|
@@ -2363,7 +2385,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
| 2363 |
const block_q8_1 * restrict y0 = &y[i + 0];
|
| 2364 |
const block_q8_1 * restrict y1 = &y[i + 1];
|
| 2365 |
|
| 2366 |
-
summs += x0->m * y0->s + x1->m * y1->s;
|
| 2367 |
|
| 2368 |
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
| 2369 |
|
|
@@ -2387,8 +2409,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
| 2387 |
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
| 2388 |
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
| 2389 |
|
| 2390 |
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
|
| 2391 |
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
|
| 2392 |
#else
|
| 2393 |
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
|
| 2394 |
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
|
|
@@ -2405,8 +2427,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
| 2405 |
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
| 2406 |
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
| 2407 |
|
| 2408 |
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
|
| 2409 |
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
|
| 2410 |
#endif
|
| 2411 |
}
|
| 2412 |
|
|
@@ -2419,13 +2441,13 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
| 2419 |
|
| 2420 |
// Main loop
|
| 2421 |
for (int i = 0; i < nb; ++i) {
|
| 2422 |
-
const float
|
| 2423 |
-
const float
|
| 2424 |
|
| 2425 |
-
summs += x[i].m * y[i].s;
|
| 2426 |
|
| 2427 |
-
const __m256 d0v =
|
| 2428 |
-
const __m256 d1v =
|
| 2429 |
|
| 2430 |
// Compute combined scales
|
| 2431 |
const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
|
|
@@ -2434,7 +2456,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
| 2434 |
const __m256i bx = bytes_from_nibbles_32(x[i].qs);
|
| 2435 |
const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
|
| 2436 |
|
| 2437 |
-
const __m256 xy =
|
| 2438 |
|
| 2439 |
// Accumulate d0*d1*x*y
|
| 2440 |
#if defined(__AVX2__)
|
|
@@ -2459,7 +2481,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
| 2459 |
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
| 2460 |
}
|
| 2461 |
|
| 2462 |
-
sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s;
|
| 2463 |
}
|
| 2464 |
|
| 2465 |
*s = sumf;
|
|
@@ -2535,16 +2557,13 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2535 |
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
| 2536 |
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
| 2537 |
|
| 2538 |
-
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
| 2539 |
-
const float x1d = GGML_FP16_TO_FP32(x1->d);
|
| 2540 |
-
|
| 2541 |
#if defined(__ARM_FEATURE_DOTPROD)
|
| 2542 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 2543 |
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
| 2544 |
-
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))),
|
| 2545 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 2546 |
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
| 2547 |
-
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))),
|
| 2548 |
#else
|
| 2549 |
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
|
| 2550 |
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
|
@@ -2561,8 +2580,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2561 |
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
| 2562 |
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
| 2563 |
|
| 2564 |
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)),
|
| 2565 |
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)),
|
| 2566 |
#endif
|
| 2567 |
}
|
| 2568 |
|
|
@@ -2637,7 +2656,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2637 |
// Main loop
|
| 2638 |
for (int i = 0; i < nb; i++) {
|
| 2639 |
/* Compute combined scale for the block */
|
| 2640 |
-
const __m256 d =
|
| 2641 |
|
| 2642 |
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
| 2643 |
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
|
@@ -2661,7 +2680,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2661 |
// Main loop
|
| 2662 |
for (int i = 0; i < nb; i++) {
|
| 2663 |
/* Compute combined scale for the block */
|
| 2664 |
-
const __m256 d =
|
| 2665 |
|
| 2666 |
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
| 2667 |
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
|
@@ -2704,7 +2723,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
| 2704 |
sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
|
| 2705 |
}
|
| 2706 |
|
| 2707 |
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi;
|
| 2708 |
}
|
| 2709 |
|
| 2710 |
*s = sumf;
|
|
@@ -2786,16 +2805,13 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
| 2786 |
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
| 2787 |
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
| 2788 |
|
| 2789 |
-
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
| 2790 |
-
const float x1d = GGML_FP16_TO_FP32(x1->d);
|
| 2791 |
-
|
| 2792 |
#if defined(__ARM_FEATURE_DOTPROD)
|
| 2793 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 2794 |
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
| 2795 |
-
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))),
|
| 2796 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 2797 |
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
| 2798 |
-
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))),
|
| 2799 |
#else
|
| 2800 |
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
|
| 2801 |
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
|
@@ -2812,8 +2828,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
| 2812 |
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
| 2813 |
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
| 2814 |
|
| 2815 |
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)),
|
| 2816 |
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)),
|
| 2817 |
#endif
|
| 2818 |
}
|
| 2819 |
|
|
@@ -2873,15 +2889,14 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
| 2873 |
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
| 2874 |
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
| 2875 |
|
| 2876 |
-
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
| 2877 |
-
|
| 2878 |
// dot product
|
| 2879 |
-
sumv = wasm_f32x4_add(sumv,
|
| 2880 |
-
|
| 2881 |
-
|
| 2882 |
-
|
| 2883 |
-
|
| 2884 |
-
|
|
|
|
| 2885 |
}
|
| 2886 |
|
| 2887 |
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
|
@@ -2903,10 +2918,10 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
| 2903 |
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
| 2904 |
bx = _mm256_or_si256(bx, bxhi);
|
| 2905 |
|
| 2906 |
-
const __m256 dy =
|
| 2907 |
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 2908 |
|
| 2909 |
-
const __m256 q =
|
| 2910 |
|
| 2911 |
acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
|
| 2912 |
}
|
|
@@ -2937,10 +2952,10 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
| 2937 |
bxh = _mm_or_si128(bxh, bxhih);
|
| 2938 |
bx = _mm256_set_m128i(bxh, bxl);
|
| 2939 |
|
| 2940 |
-
const __m256 dy =
|
| 2941 |
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 2942 |
|
| 2943 |
-
const __m256 q =
|
| 2944 |
|
| 2945 |
acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
|
| 2946 |
}
|
|
@@ -3007,11 +3022,11 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
|
|
| 3007 |
#if defined(__ARM_FEATURE_DOTPROD)
|
| 3008 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 3009 |
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
| 3010 |
-
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d);
|
| 3011 |
|
| 3012 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 3013 |
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
| 3014 |
-
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d);
|
| 3015 |
|
| 3016 |
#else
|
| 3017 |
const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
|
|
@@ -3029,8 +3044,8 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
|
|
| 3029 |
const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
|
| 3030 |
const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
|
| 3031 |
|
| 3032 |
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d);
|
| 3033 |
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d);
|
| 3034 |
#endif
|
| 3035 |
}
|
| 3036 |
|
|
@@ -3042,7 +3057,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
|
|
| 3042 |
// Main loop
|
| 3043 |
for (int i = 0; i < nb; ++i) {
|
| 3044 |
// Compute combined scale for the block
|
| 3045 |
-
const __m256 d =
|
| 3046 |
__m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
|
| 3047 |
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 3048 |
|
|
@@ -3068,7 +3083,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
|
|
| 3068 |
sumi += x[i].qs[j]*y[i].qs[j];
|
| 3069 |
}
|
| 3070 |
|
| 3071 |
-
sumf += (x[i].d*y[i].d)
|
| 3072 |
}
|
| 3073 |
|
| 3074 |
*s = sumf;
|
|
@@ -3457,6 +3472,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
|
|
| 3457 |
"ROPE",
|
| 3458 |
"ROPE_BACK",
|
| 3459 |
"ALIBI",
|
|
|
|
| 3460 |
"CONV_1D_1S",
|
| 3461 |
"CONV_1D_2S",
|
| 3462 |
|
|
@@ -3467,7 +3483,8 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
|
|
| 3467 |
"MAP_BINARY",
|
| 3468 |
};
|
| 3469 |
|
| 3470 |
-
static_assert(GGML_OP_COUNT ==
|
|
|
|
| 3471 |
|
| 3472 |
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
| 3473 |
"none",
|
|
@@ -3517,6 +3534,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
| 3517 |
"rope(x)",
|
| 3518 |
"rope_back(x)",
|
| 3519 |
"alibi(x)",
|
|
|
|
| 3520 |
"conv_1d_1s(x)",
|
| 3521 |
"conv_1d_2s(x)",
|
| 3522 |
|
|
@@ -3527,7 +3545,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
| 3527 |
"f(x,y)",
|
| 3528 |
};
|
| 3529 |
|
| 3530 |
-
static_assert(GGML_OP_COUNT ==
|
| 3531 |
|
| 3532 |
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
| 3533 |
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
|
@@ -3761,6 +3779,12 @@ static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct g
|
|
| 3761 |
(t1->ne[3]%t0->ne[3] == 0);
|
| 3762 |
}
|
| 3763 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3764 |
static inline int ggml_up32(int n) {
|
| 3765 |
return (n + 31) & ~31;
|
| 3766 |
}
|
|
@@ -4643,11 +4667,15 @@ struct ggml_tensor * ggml_mul_impl(
|
|
| 4643 |
struct ggml_tensor * a,
|
| 4644 |
struct ggml_tensor * b,
|
| 4645 |
bool inplace) {
|
| 4646 |
-
|
|
|
|
|
|
|
| 4647 |
|
| 4648 |
bool is_node = false;
|
| 4649 |
|
| 4650 |
if (!inplace && (a->grad || b->grad)) {
|
|
|
|
|
|
|
| 4651 |
is_node = true;
|
| 4652 |
}
|
| 4653 |
|
|
@@ -6189,7 +6217,8 @@ struct ggml_tensor * ggml_alibi(
|
|
| 6189 |
struct ggml_context * ctx,
|
| 6190 |
struct ggml_tensor * a,
|
| 6191 |
int n_past,
|
| 6192 |
-
int n_head
|
|
|
|
| 6193 |
GGML_ASSERT(n_past >= 0);
|
| 6194 |
bool is_node = false;
|
| 6195 |
|
|
@@ -6208,6 +6237,8 @@ struct ggml_tensor * ggml_alibi(
|
|
| 6208 |
|
| 6209 |
((int32_t *) b->data)[0] = n_past;
|
| 6210 |
((int32_t *) b->data)[1] = n_head;
|
|
|
|
|
|
|
| 6211 |
|
| 6212 |
ggml_scratch_load(ctx);
|
| 6213 |
|
|
@@ -6219,6 +6250,40 @@ struct ggml_tensor * ggml_alibi(
|
|
| 6219 |
return result;
|
| 6220 |
}
|
| 6221 |
|
|
|
| 6222 |
// ggml_conv_1d_1s
|
| 6223 |
|
| 6224 |
struct ggml_tensor * ggml_conv_1d_1s(
|
|
@@ -7945,7 +8010,7 @@ static void ggml_compute_forward_mul_f32(
|
|
| 7945 |
const struct ggml_tensor * src0,
|
| 7946 |
const struct ggml_tensor * src1,
|
| 7947 |
struct ggml_tensor * dst) {
|
| 7948 |
-
|
| 7949 |
|
| 7950 |
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
| 7951 |
return;
|
|
@@ -7953,10 +8018,25 @@ static void ggml_compute_forward_mul_f32(
|
|
| 7953 |
const int ith = params->ith;
|
| 7954 |
const int nth = params->nth;
|
| 7955 |
|
| 7956 |
-
|
| 7957 |
-
|
| 7958 |
-
|
| 7959 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7960 |
|
| 7961 |
const size_t nb00 = src0->nb[0];
|
| 7962 |
const size_t nb01 = src0->nb[1];
|
|
@@ -7975,44 +8055,51 @@ static void ggml_compute_forward_mul_f32(
|
|
| 7975 |
|
| 7976 |
GGML_ASSERT( nb0 == sizeof(float));
|
| 7977 |
GGML_ASSERT(nb00 == sizeof(float));
|
|
|
|
| 7978 |
|
| 7979 |
if (nb10 == sizeof(float)) {
|
| 7980 |
-
for (
|
| 7981 |
-
// src0
|
| 7982 |
-
const
|
| 7983 |
-
const
|
| 7984 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7985 |
|
|
|
|
|
|
|
|
|
|
| 7986 |
|
| 7987 |
#ifdef GGML_USE_ACCELERATE
|
| 7988 |
UNUSED(ggml_vec_mul_f32);
|
| 7989 |
|
| 7990 |
-
vDSP_vmul(
|
| 7991 |
-
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
|
| 7992 |
-
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
|
| 7993 |
-
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
|
| 7994 |
-
ne0);
|
| 7995 |
#else
|
| 7996 |
-
ggml_vec_mul_f32(
|
| 7997 |
-
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
|
| 7998 |
-
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
|
| 7999 |
-
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
|
| 8000 |
#endif
|
| 8001 |
// }
|
| 8002 |
// }
|
| 8003 |
}
|
| 8004 |
} else {
|
| 8005 |
// src1 is not contiguous
|
| 8006 |
-
for (
|
| 8007 |
-
// src0
|
| 8008 |
-
|
| 8009 |
-
const
|
| 8010 |
-
const
|
|
|
|
| 8011 |
|
| 8012 |
-
|
| 8013 |
-
|
| 8014 |
-
|
| 8015 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8016 |
|
| 8017 |
dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
|
| 8018 |
}
|
|
@@ -10501,34 +10588,29 @@ static void ggml_compute_forward_diag_mask_f32(
|
|
| 10501 |
assert(src1->type == GGML_TYPE_I32);
|
| 10502 |
assert(ggml_nelements(src1) == 2);
|
| 10503 |
|
|
|
|
|
|
|
|
|
|
| 10504 |
const int n_past = ((int32_t *) src1->data)[0];
|
| 10505 |
const bool inplace = (bool)((int32_t *) src1->data)[1];
|
| 10506 |
|
| 10507 |
-
|
| 10508 |
-
// TODO: this hack is not good, need a better way to handle this
|
| 10509 |
-
if (!inplace) {
|
| 10510 |
-
// use the init task to copy src -> dst
|
| 10511 |
-
struct ggml_compute_params params_cpy = *params;
|
| 10512 |
-
|
| 10513 |
-
params_cpy.ith = 0;
|
| 10514 |
-
params_cpy.nth = 1;
|
| 10515 |
-
params_cpy.type = GGML_TASK_COMPUTE;
|
| 10516 |
-
|
| 10517 |
-
ggml_compute_forward_dup_same_cont(¶ms_cpy, src0, dst);
|
| 10518 |
-
}
|
| 10519 |
|
| 10520 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10521 |
}
|
| 10522 |
|
| 10523 |
-
if (params->type == GGML_TASK_FINALIZE) {
|
| 10524 |
return;
|
| 10525 |
}
|
| 10526 |
|
| 10527 |
-
const int ith = params->ith;
|
| 10528 |
-
const int nth = params->nth;
|
| 10529 |
-
|
| 10530 |
-
assert(n_past >= 0);
|
| 10531 |
-
|
| 10532 |
// TODO: handle transposed/permuted matrices
|
| 10533 |
|
| 10534 |
const int n = ggml_nrows(src0);
|
|
@@ -10682,14 +10764,15 @@ static void ggml_compute_forward_alibi_f32(
|
|
| 10682 |
struct ggml_tensor * dst) {
|
| 10683 |
assert(params->ith == 0);
|
| 10684 |
assert(src1->type == GGML_TYPE_I32);
|
| 10685 |
-
assert(ggml_nelements(src1) ==
|
| 10686 |
|
| 10687 |
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
| 10688 |
return;
|
| 10689 |
}
|
| 10690 |
|
| 10691 |
-
const int
|
| 10692 |
-
const int
|
|
|
|
| 10693 |
|
| 10694 |
assert(n_past >= 0);
|
| 10695 |
|
|
@@ -10712,8 +10795,8 @@ static void ggml_compute_forward_alibi_f32(
|
|
| 10712 |
// add alibi to src0 (KQ_scaled)
|
| 10713 |
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
| 10714 |
|
| 10715 |
-
const float m0 = powf(2.0f, -
|
| 10716 |
-
const float m1 = powf(2.0f, -
|
| 10717 |
|
| 10718 |
for (int i = 0; i < ne0; i++) {
|
| 10719 |
for (int j = 0; j < ne1; j++) {
|
|
@@ -10731,13 +10814,13 @@ static void ggml_compute_forward_alibi_f32(
|
|
| 10731 |
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
|
| 10732 |
}
|
| 10733 |
|
| 10734 |
-
pdst[0] = i * m_k + src[0];
|
|
|
|
| 10735 |
}
|
| 10736 |
}
|
| 10737 |
}
|
| 10738 |
}
|
| 10739 |
|
| 10740 |
-
|
| 10741 |
static void ggml_compute_forward_alibi_f16(
|
| 10742 |
const struct ggml_compute_params * params,
|
| 10743 |
const struct ggml_tensor * src0,
|
|
@@ -10745,14 +10828,15 @@ static void ggml_compute_forward_alibi_f16(
|
|
| 10745 |
struct ggml_tensor * dst) {
|
| 10746 |
assert(params->ith == 0);
|
| 10747 |
assert(src1->type == GGML_TYPE_I32);
|
| 10748 |
-
assert(ggml_nelements(src1) ==
|
| 10749 |
|
| 10750 |
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
| 10751 |
return;
|
| 10752 |
}
|
| 10753 |
|
| 10754 |
-
const int
|
| 10755 |
-
const int
|
|
|
|
| 10756 |
|
| 10757 |
assert(n_past >= 0);
|
| 10758 |
|
|
@@ -10775,8 +10859,8 @@ static void ggml_compute_forward_alibi_f16(
|
|
| 10775 |
// add alibi to src0 (KQ_scaled)
|
| 10776 |
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
| 10777 |
|
| 10778 |
-
const float m0 = powf(2.0f, -
|
| 10779 |
-
const float m1 = powf(2.0f, -
|
| 10780 |
|
| 10781 |
for (int i = 0; i < ne0; i++) {
|
| 10782 |
for (int j = 0; j < ne1; j++) {
|
|
@@ -10795,7 +10879,7 @@ static void ggml_compute_forward_alibi_f16(
|
|
| 10795 |
}
|
| 10796 |
|
| 10797 |
// we return F32
|
| 10798 |
-
pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
|
| 10799 |
}
|
| 10800 |
}
|
| 10801 |
}
|
|
@@ -10831,6 +10915,77 @@ static void ggml_compute_forward_alibi(
|
|
| 10831 |
}
|
| 10832 |
}
|
| 10833 |
|
|
|
|
|
|
|
|
| 10834 |
// ggml_compute_forward_rope
|
| 10835 |
|
| 10836 |
static void ggml_compute_forward_rope_f32(
|
|
@@ -12812,6 +12967,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
| 12812 |
{
|
| 12813 |
ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
|
| 12814 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12815 |
case GGML_OP_CONV_1D_1S:
|
| 12816 |
{
|
| 12817 |
ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
|
|
@@ -13119,6 +13278,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
| 13119 |
{
|
| 13120 |
GGML_ASSERT(false); // TODO: not implemented
|
| 13121 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13122 |
case GGML_OP_SILU:
|
| 13123 |
{
|
| 13124 |
// necessary for llama
|
|
@@ -13998,6 +14161,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
| 13998 |
{
|
| 13999 |
node->n_tasks = 1; //TODO
|
| 14000 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14001 |
case GGML_OP_CONV_1D_1S:
|
| 14002 |
case GGML_OP_CONV_1D_2S:
|
| 14003 |
{
|
|
|
|
| 512 |
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
|
| 513 |
}
|
| 514 |
|
| 515 |
+
#if defined(__AVX2__) || defined(__AVX512F__)
|
| 516 |
// spread 32 bits to 32 bytes { 0x00, 0xFF }
|
| 517 |
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
|
| 518 |
uint32_t x32;
|
|
|
|
| 543 |
return _mm256_cvtepi32_ps(summed_pairs);
|
| 544 |
}
|
| 545 |
|
| 546 |
+
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 547 |
#if __AVXVNNI__
|
| 548 |
const __m256i zero = _mm256_setzero_si256();
|
| 549 |
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
|
|
|
|
| 555 |
#endif
|
| 556 |
}
|
| 557 |
|
| 558 |
+
// multiply int8_t, add results pairwise twice and return as float vector
|
| 559 |
+
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
|
| 560 |
+
#if __AVXVNNIINT8__
|
| 561 |
+
const __m256i zero = _mm256_setzero_si256();
|
| 562 |
+
const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
|
| 563 |
+
return _mm256_cvtepi32_ps(summed_pairs);
|
| 564 |
+
#else
|
| 565 |
+
// Get absolute values of x vectors
|
| 566 |
+
const __m256i ax = _mm256_sign_epi8(x, x);
|
| 567 |
+
// Sign the values of the y vectors
|
| 568 |
+
const __m256i sy = _mm256_sign_epi8(y, x);
|
| 569 |
+
return mul_sum_us8_pairs_float(ax, sy);
|
| 570 |
+
#endif
|
| 571 |
+
}
|
| 572 |
+
|
| 573 |
static inline __m128i packNibbles( __m256i bytes )
|
| 574 |
{
|
| 575 |
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
|
|
|
|
| 629 |
return _mm256_cvtepi32_ps(summed_pairs);
|
| 630 |
}
|
| 631 |
|
| 632 |
+
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
|
| 633 |
+
const __m128i axl = _mm256_castsi256_si128(ax);
|
| 634 |
+
const __m128i axh = _mm256_extractf128_si256(ax, 1);
|
| 635 |
+
const __m128i syl = _mm256_castsi256_si128(sy);
|
| 636 |
+
const __m128i syh = _mm256_extractf128_si256(sy, 1);
|
| 637 |
+
// Perform multiplication and create 16-bit values
|
| 638 |
+
const __m128i dotl = _mm_maddubs_epi16(axl, syl);
|
| 639 |
+
const __m128i doth = _mm_maddubs_epi16(axh, syh);
|
| 640 |
+
return sum_i16_pairs_float(doth, dotl);
|
| 641 |
+
}
|
| 642 |
+
|
| 643 |
// multiply int8_t, add results pairwise twice and return as float vector
|
| 644 |
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
|
| 645 |
const __m128i xl = _mm256_castsi256_si128(x);
|
|
|
|
| 688 |
#endif // __AVX__ || __AVX2__ || __AVX512F__
|
| 689 |
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
| 690 |
|
| 691 |
+
#if defined(__ARM_NEON)
|
| 692 |
|
| 693 |
#if !defined(__aarch64__)
|
| 694 |
|
|
|
|
| 769 |
|
| 770 |
#define QK4_0 32
|
| 771 |
typedef struct {
|
| 772 |
+
ggml_fp16_t d; // delta
|
| 773 |
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
| 774 |
} block_q4_0;
|
| 775 |
+
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
| 776 |
|
| 777 |
#define QK4_1 32
|
| 778 |
typedef struct {
|
| 779 |
+
ggml_fp16_t d; // delta
|
| 780 |
+
ggml_fp16_t m; // min
|
| 781 |
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
| 782 |
} block_q4_1;
|
| 783 |
+
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
|
| 784 |
|
| 785 |
#define QK5_0 32
|
| 786 |
typedef struct {
|
|
|
|
| 801 |
|
| 802 |
#define QK8_0 32
|
| 803 |
typedef struct {
|
| 804 |
+
ggml_fp16_t d; // delta
|
| 805 |
+
int8_t qs[QK8_0]; // quants
|
| 806 |
} block_q8_0;
|
| 807 |
+
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
| 808 |
|
| 809 |
#define QK8_1 32
|
| 810 |
typedef struct {
|
| 811 |
+
float d; // delta
|
| 812 |
+
float s; // d * sum(qs[i])
|
| 813 |
+
int8_t qs[QK8_1]; // quants
|
| 814 |
} block_q8_1;
|
| 815 |
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
|
| 816 |
|
|
|
|
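The structs above carry the substance of the quantization change: per-block scales (and the Q4_1 min) are now stored as ggml_fp16_t and widened with GGML_FP16_TO_FP32 at the use sites, halving the per-block scale overhead. As a reading aid, a scalar sketch of how one Q4_0 block decodes, with placeholder names and the scale already widened to float (the real code converts from fp16 first):

#include <stdint.h>

#define QK4_0 32

typedef struct {
    float   d;              // per-block scale (stored as fp16 in the real struct)
    uint8_t qs[QK4_0 / 2];  // 32 quants, two 4-bit values per byte
} demo_block_q4_0;

// low nibble fills the first half of the block, high nibble the second half,
// and the stored values carry an implicit -8 offset
static void demo_dequantize_q4_0(const demo_block_q4_0 * b, float out[QK4_0]) {
    for (int j = 0; j < QK4_0/2; ++j) {
        const int x0 = (b->qs[j] & 0x0F) - 8;
        const int x1 = (b->qs[j] >>   4) - 8;
        out[j          ] = x0 * b->d;
        out[j + QK4_0/2] = x1 * b->d;
    }
}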
| 837 |
const float d = max / -8;
|
| 838 |
const float id = d ? 1.0f/d : 0.0f;
|
| 839 |
|
| 840 |
+
y[i].d = GGML_FP32_TO_FP16(d);
|
| 841 |
|
| 842 |
for (int j = 0; j < qk/2; ++j) {
|
| 843 |
const float x0 = x[i*qk + 0 + j]*id;
|
|
|
|
| 877 |
const float d = (max - min) / ((1 << 4) - 1);
|
| 878 |
const float id = d ? 1.0f/d : 0.0f;
|
| 879 |
|
| 880 |
+
y[i].d = GGML_FP32_TO_FP16(d);
|
| 881 |
+
y[i].m = GGML_FP32_TO_FP16(min);
|
| 882 |
|
| 883 |
for (int j = 0; j < qk/2; ++j) {
|
| 884 |
const float x0 = (x[i*qk + 0 + j] - min)*id;
|
|
|
|
| 1009 |
const float d = amax / ((1 << 7) - 1);
|
| 1010 |
const float id = d ? 1.0f/d : 0.0f;
|
| 1011 |
|
| 1012 |
+
y[i].d = GGML_FP32_TO_FP16(d);
|
| 1013 |
|
| 1014 |
for (int j = 0; j < QK8_0; ++j) {
|
| 1015 |
const float x0 = x[i*QK8_0 + j]*id;
|
|
|
|
| 1044 |
const float d = amax / ((1 << 7) - 1);
|
| 1045 |
const float id = d ? 1.0f/d : 0.0f;
|
| 1046 |
|
| 1047 |
+
y[i].d = GGML_FP32_TO_FP16(d);
|
| 1048 |
|
| 1049 |
for (int j = 0; j < 8; j++) {
|
| 1050 |
const float32x4_t v = vmulq_n_f32(srcv[j], id);
|
|
|
|
| 1079 |
|
| 1080 |
// Quantize these floats
|
| 1081 |
const float d = maxScalar / 127.f;
|
| 1082 |
+
y[i].d = GGML_FP32_TO_FP16(d);
|
| 1083 |
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
|
| 1084 |
const __m256 mul = _mm256_set1_ps( id );
|
| 1085 |
|
|
|
|
| 1178 |
sum += y[i].qs[QK8_1/2 + j];
|
| 1179 |
}
|
| 1180 |
|
| 1181 |
+
y[i].s = sum*d;
|
| 1182 |
}
|
| 1183 |
}
|
| 1184 |
|
|
|
|
| 1330 |
const int nb = k / qk;
|
| 1331 |
|
| 1332 |
for (int i = 0; i < nb; i++) {
|
| 1333 |
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
| 1334 |
|
| 1335 |
for (int j = 0; j < qk/2; ++j) {
|
| 1336 |
const int x0 = (x[i].qs[j] & 0x0F) - 8;
|
|
|
|
| 1350 |
const int nb = k / qk;
|
| 1351 |
|
| 1352 |
for (int i = 0; i < nb; i++) {
|
| 1353 |
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
| 1354 |
+
const float m = GGML_FP16_TO_FP32(x[i].m);
|
| 1355 |
|
| 1356 |
for (int j = 0; j < qk/2; ++j) {
|
| 1357 |
const int x0 = (x[i].qs[j] & 0x0F);
|
|
|
|
| 1426 |
const block_q8_0 * restrict x = vx;
|
| 1427 |
|
| 1428 |
for (int i = 0; i < nb; i++) {
|
| 1429 |
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
| 1430 |
|
| 1431 |
for (int j = 0; j < qk; ++j) {
|
| 1432 |
y[i*qk + j] = x[i].qs[j]*d;
|
|
|
|
| 1690 |
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
|
| 1691 |
float tmp[8];
|
| 1692 |
|
| 1693 |
+
for (int i = 0; i < 8; i++) {
|
| 1694 |
tmp[i] = GGML_FP16_TO_FP32(x[i]);
|
| 1695 |
+
}
|
| 1696 |
|
| 1697 |
return _mm256_loadu_ps(tmp);
|
| 1698 |
}
|
|
|
|
| 2112 |
const block_q8_0 * restrict y0 = &y[i + 0];
|
| 2113 |
const block_q8_0 * restrict y1 = &y[i + 1];
|
| 2114 |
|
| 2115 |
+
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
| 2116 |
+
const int8x16_t s8b = vdupq_n_s8(0x8);
|
| 2117 |
|
| 2118 |
const uint8x16_t v0_0 = vld1q_u8(x0->qs);
|
| 2119 |
const uint8x16_t v0_1 = vld1q_u8(x1->qs);
|
|
|
|
| 2141 |
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
| 2142 |
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
| 2143 |
|
| 2144 |
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 2145 |
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 2146 |
#else
|
| 2147 |
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
|
| 2148 |
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
|
|
|
|
| 2159 |
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
| 2160 |
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
| 2161 |
|
| 2162 |
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 2163 |
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 2164 |
#endif
|
| 2165 |
}
|
| 2166 |
|
|
|
|
| 2172 |
// Main loop
|
| 2173 |
for (int i = 0; i < nb; ++i) {
|
| 2174 |
/* Compute combined scale for the block */
|
| 2175 |
+
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
| 2176 |
|
| 2177 |
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
| 2178 |
|
|
|
|
| 2196 |
// Main loop
|
| 2197 |
for (int i = 0; i < nb; ++i) {
|
| 2198 |
// Compute combined scale for the block
|
| 2199 |
+
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
| 2200 |
|
| 2201 |
const __m128i lowMask = _mm_set1_epi8(0xF);
|
| 2202 |
const __m128i off = _mm_set1_epi8(8);
|
|
|
|
| 2238 |
_mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);
|
| 2239 |
|
| 2240 |
// Compute combined scale for the block 0 and 1
|
| 2241 |
+
const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );
|
| 2242 |
|
| 2243 |
const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
|
| 2244 |
|
|
|
|
| 2256 |
_mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
|
| 2257 |
|
| 2258 |
// Compute combined scale for the block 2 and 3
|
| 2259 |
+
const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );
|
| 2260 |
|
| 2261 |
const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
|
| 2262 |
|
|
|
|
| 2289 |
_mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
|
| 2290 |
|
| 2291 |
// Compute combined scale for the block 0 and 1
|
| 2292 |
+
const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
| 2293 |
|
| 2294 |
const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
|
| 2295 |
|
|
|
|
| 2307 |
_mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
|
| 2308 |
|
| 2309 |
// Compute combined scale for the block 2 and 3
|
| 2310 |
+
const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );
|
| 2311 |
|
| 2312 |
const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
|
| 2313 |
|
|
|
|
| 2355 |
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
| 2356 |
}
|
| 2357 |
|
| 2358 |
+
sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
|
| 2359 |
}
|
| 2360 |
|
| 2361 |
*s = sumf;
|
|
|
|
| 2385 |
const block_q8_1 * restrict y0 = &y[i + 0];
|
| 2386 |
const block_q8_1 * restrict y1 = &y[i + 1];
|
| 2387 |
|
| 2388 |
+
summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
|
| 2389 |
|
| 2390 |
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
| 2391 |
|
|
|
|
| 2409 |
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
| 2410 |
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
| 2411 |
|
| 2412 |
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
| 2413 |
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
| 2414 |
#else
|
| 2415 |
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
|
| 2416 |
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
|
|
|
|
| 2427 |
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
| 2428 |
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
| 2429 |
|
| 2430 |
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
| 2431 |
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
| 2432 |
#endif
|
| 2433 |
}
|
| 2434 |
|
|
|
|
| 2441 |
|
| 2442 |
// Main loop
|
| 2443 |
for (int i = 0; i < nb; ++i) {
|
| 2444 |
+
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
| 2445 |
+
const float d1 = y[i].d;
|
| 2446 |
|
| 2447 |
+
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
| 2448 |
|
| 2449 |
+
const __m256 d0v = _mm256_set1_ps( d0 );
|
| 2450 |
+
const __m256 d1v = _mm256_set1_ps( d1 );
|
| 2451 |
|
| 2452 |
// Compute combined scales
|
| 2453 |
const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
|
|
|
|
| 2456 |
const __m256i bx = bytes_from_nibbles_32(x[i].qs);
|
| 2457 |
const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
|
| 2458 |
|
| 2459 |
+
const __m256 xy = mul_sum_us8_pairs_float(bx, by);
|
| 2460 |
|
| 2461 |
// Accumulate d0*d1*x*y
|
| 2462 |
#if defined(__AVX2__)
|
|
|
|
| 2481 |
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
| 2482 |
}
|
| 2483 |
|
| 2484 |
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
| 2485 |
}
|
| 2486 |
|
| 2487 |
*s = sumf;
|
|
|
|
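The scalar tail just above shows why Q8_1 stores s = d*sum(qs) at quantization time: expanding one Q4_1 block (values d_x*q + m_x) against a Q8_1 block (values d_y*p) gives sum((d_x*q + m_x)*(d_y*p)) = d_x*d_y*sum(q*p) + m_x*(d_y*sum(p)), and the second factor is exactly s, so the min term costs one multiply-add per block. A one-block reference with illustrative names:

#include <stdint.h>

// x_j = d_x*q_j + m_x (q_j in [0,15]),  y_j = d_y*p_j (signed int8),  s = d_y*sum(p_j)
static float dot_q4_1_q8_1_block(float d_x, float m_x, const uint8_t q[32],
                                 float d_y, float s,   const int8_t  p[32]) {
    int sumi = 0;
    for (int j = 0; j < 32; ++j) {
        sumi += (int) q[j] * (int) p[j];
    }
    return (d_x * d_y) * (float) sumi + m_x * s;  // same form as the sumf update above
}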
| 2557 |
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
| 2558 |
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
| 2559 |
|
| 2560 |
#if defined(__ARM_FEATURE_DOTPROD)
|
| 2561 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 2562 |
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
| 2563 |
+
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 2564 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 2565 |
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
| 2566 |
+
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 2567 |
#else
|
| 2568 |
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
|
| 2569 |
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
|
|
|
| 2580 |
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
| 2581 |
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
| 2582 |
|
| 2583 |
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 2584 |
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 2585 |
#endif
|
| 2586 |
}
|
| 2587 |
|
|
|
|
| 2656 |
// Main loop
|
| 2657 |
for (int i = 0; i < nb; i++) {
|
| 2658 |
/* Compute combined scale for the block */
|
| 2659 |
+
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
| 2660 |
|
| 2661 |
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
| 2662 |
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
|
|
|
| 2680 |
// Main loop
|
| 2681 |
for (int i = 0; i < nb; i++) {
|
| 2682 |
/* Compute combined scale for the block */
|
| 2683 |
+
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
| 2684 |
|
| 2685 |
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
| 2686 |
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
|
|
|
| 2723 |
sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
|
| 2724 |
}
|
| 2725 |
|
| 2726 |
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
|
| 2727 |
}
|
| 2728 |
|
| 2729 |
*s = sumf;
|
|
|
|
| 2805 |
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
| 2806 |
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
| 2807 |
|
| 2808 |
#if defined(__ARM_FEATURE_DOTPROD)
|
| 2809 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 2810 |
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
| 2811 |
+
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
| 2812 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 2813 |
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
| 2814 |
+
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
| 2815 |
#else
|
| 2816 |
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
|
| 2817 |
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
|
|
|
| 2828 |
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
| 2829 |
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
| 2830 |
|
| 2831 |
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
| 2832 |
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
| 2833 |
#endif
|
| 2834 |
}
|
| 2835 |
|
|
|
|
| 2889 |
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
| 2890 |
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
| 2891 |
|
| 2892 |
// dot product
|
| 2893 |
+
sumv = wasm_f32x4_add(sumv,
|
| 2894 |
+
wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
|
| 2895 |
+
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
| 2896 |
+
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
| 2897 |
+
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
| 2898 |
+
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
| 2899 |
+
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d));
|
| 2900 |
}
|
| 2901 |
|
| 2902 |
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
|
|
|
| 2918 |
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
| 2919 |
bx = _mm256_or_si256(bx, bxhi);
|
| 2920 |
|
| 2921 |
+
const __m256 dy = _mm256_set1_ps(y[i].d);
|
| 2922 |
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 2923 |
|
| 2924 |
+
const __m256 q = mul_sum_us8_pairs_float(bx, by);
|
| 2925 |
|
| 2926 |
acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
|
| 2927 |
}
|
|
|
|
| 2952 |
bxh = _mm_or_si128(bxh, bxhih);
|
| 2953 |
bx = _mm256_set_m128i(bxh, bxl);
|
| 2954 |
|
| 2955 |
+
const __m256 dy = _mm256_set1_ps(y[i].d);
|
| 2956 |
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 2957 |
|
| 2958 |
+
const __m256 q = mul_sum_us8_pairs_float(bx, by);
|
| 2959 |
|
| 2960 |
acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
|
| 2961 |
}
|
|
|
|
| 3022 |
#if defined(__ARM_FEATURE_DOTPROD)
|
| 3023 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 3024 |
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
| 3025 |
+
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 3026 |
|
| 3027 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 3028 |
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
| 3029 |
+
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 3030 |
|
| 3031 |
#else
|
| 3032 |
const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
|
|
|
|
| 3044 |
const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
|
| 3045 |
const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
|
| 3046 |
|
| 3047 |
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 3048 |
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 3049 |
#endif
|
| 3050 |
}
|
| 3051 |
|
|
|
|
| 3057 |
// Main loop
|
| 3058 |
for (int i = 0; i < nb; ++i) {
|
| 3059 |
// Compute combined scale for the block
|
| 3060 |
+
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
| 3061 |
__m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
|
| 3062 |
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 3063 |
|
|
|
|
| 3083 |
sumi += x[i].qs[j]*y[i].qs[j];
|
| 3084 |
}
|
| 3085 |
|
| 3086 |
+
sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
|
| 3087 |
}
|
| 3088 |
|
| 3089 |
*s = sumf;
|
|
|
|
| 3472 |
"ROPE",
|
| 3473 |
"ROPE_BACK",
|
| 3474 |
"ALIBI",
|
| 3475 |
+
"CLAMP",
|
| 3476 |
"CONV_1D_1S",
|
| 3477 |
"CONV_1D_2S",
|
| 3478 |
|
|
|
|
| 3483 |
"MAP_BINARY",
|
| 3484 |
};
|
| 3485 |
|
| 3486 |
+
static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
|
| 3487 |
+
|
| 3488 |
|
| 3489 |
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
| 3490 |
"none",
|
|
|
|
| 3534 |
"rope(x)",
|
| 3535 |
"rope_back(x)",
|
| 3536 |
"alibi(x)",
|
| 3537 |
+
"clamp(x)",
|
| 3538 |
"conv_1d_1s(x)",
|
| 3539 |
"conv_1d_2s(x)",
|
| 3540 |
|
|
|
|
| 3545 |
"f(x,y)",
|
| 3546 |
};
|
| 3547 |
|
| 3548 |
+
static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
|
| 3549 |
|
| 3550 |
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
| 3551 |
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
|
|
|
| 3779 |
(t1->ne[3]%t0->ne[3] == 0);
|
| 3780 |
}
|
| 3781 |
|
| 3782 |
+
static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
| 3783 |
+
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 3784 |
+
|
| 3785 |
+
return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
|
| 3786 |
+
}
|
| 3787 |
+
|
| 3788 |
static inline int ggml_up32(int n) {
|
| 3789 |
return (n + 31) & ~31;
|
| 3790 |
}
|
|
|
|
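ggml_can_repeat_rows is the gate for the new row-wise broadcasting in ggml_mul: src1 must match src0 in ne[0] and be repeatable along the remaining dimensions, which the forward pass below exploits with the i11 = i01 % ne11 style indexing. A hedged usage sketch (the tensor shapes and the helper name are assumptions; only ggml_mul itself comes from the API):

#include "ggml.h"

// multiply every row of a [n_embd, n_tokens] activation by one [n_embd, 1]
// per-channel scale, relying on the relaxed ggml_can_repeat_rows(b, a) check
static struct ggml_tensor * scale_rows(struct ggml_context * ctx,
                                       struct ggml_tensor  * act,     // [n_embd, n_tokens]
                                       struct ggml_tensor  * scale) { // [n_embd, 1]
    return ggml_mul(ctx, act, scale);
}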
| 4667 |
struct ggml_tensor * a,
|
| 4668 |
struct ggml_tensor * b,
|
| 4669 |
bool inplace) {
|
| 4670 |
+
// TODO: support less-strict constraint
|
| 4671 |
+
// GGML_ASSERT(ggml_can_repeat(b, a));
|
| 4672 |
+
GGML_ASSERT(ggml_can_repeat_rows(b, a));
|
| 4673 |
|
| 4674 |
bool is_node = false;
|
| 4675 |
|
| 4676 |
if (!inplace && (a->grad || b->grad)) {
|
| 4677 |
+
// TODO: support backward pass for broadcasting
|
| 4678 |
+
GGML_ASSERT(ggml_are_same_shape(a, b));
|
| 4679 |
is_node = true;
|
| 4680 |
}
|
| 4681 |
|
|
|
|
| 6217 |
struct ggml_context * ctx,
|
| 6218 |
struct ggml_tensor * a,
|
| 6219 |
int n_past,
|
| 6220 |
+
int n_head,
|
| 6221 |
+
float bias_max) {
|
| 6222 |
GGML_ASSERT(n_past >= 0);
|
| 6223 |
bool is_node = false;
|
| 6224 |
|
|
|
|
| 6237 |
|
| 6238 |
((int32_t *) b->data)[0] = n_past;
|
| 6239 |
((int32_t *) b->data)[1] = n_head;
|
| 6240 |
+
GGML_ASSERT(sizeof(float) == sizeof(int32_t));
|
| 6241 |
+
(((float *) b->data)[2]) = bias_max;
|
| 6242 |
|
| 6243 |
ggml_scratch_load(ctx);
|
| 6244 |
|
|
|
|
| 6250 |
return result;
|
| 6251 |
}
|
| 6252 |
|
| 6253 |
+
// ggml_clamp
|
| 6254 |
+
|
| 6255 |
+
struct ggml_tensor * ggml_clamp(
|
| 6256 |
+
struct ggml_context * ctx,
|
| 6257 |
+
struct ggml_tensor * a,
|
| 6258 |
+
float min,
|
| 6259 |
+
float max) {
|
| 6260 |
+
bool is_node = false;
|
| 6261 |
+
|
| 6262 |
+
if (a->grad) {
|
| 6263 |
+
GGML_ASSERT(false); // TODO: implement backward
|
| 6264 |
+
is_node = true;
|
| 6265 |
+
}
|
| 6266 |
+
|
| 6267 |
+
// TODO: when implement backward, fix this:
|
| 6268 |
+
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
| 6269 |
+
|
| 6270 |
+
ggml_scratch_save(ctx);
|
| 6271 |
+
|
| 6272 |
+
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
| 6273 |
+
|
| 6274 |
+
((float *) b->data)[0] = min;
|
| 6275 |
+
((float *) b->data)[1] = max;
|
| 6276 |
+
|
| 6277 |
+
ggml_scratch_load(ctx);
|
| 6278 |
+
|
| 6279 |
+
result->op = GGML_OP_CLAMP;
|
| 6280 |
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
| 6281 |
+
result->src0 = a;
|
| 6282 |
+
result->src1 = b;
|
| 6283 |
+
|
| 6284 |
+
return result;
|
| 6285 |
+
}
|
| 6286 |
+
|
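Usage-wise, the new operator is a view-returning, in-place clamp whose bounds ride along in the small src1 tensor built above; note that the backward pass is still a TODO. A minimal sketch (the wrapper name is illustrative):

#include "ggml.h"

// limit every element of t to the range [lo, hi]; applied when the graph is evaluated
static struct ggml_tensor * clamp_activations(struct ggml_context * ctx,
                                              struct ggml_tensor  * t,
                                              float lo, float hi) {
    return ggml_clamp(ctx, t, lo, hi);
}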
| 6287 |
// ggml_conv_1d_1s
|
| 6288 |
|
| 6289 |
struct ggml_tensor * ggml_conv_1d_1s(
|
|
|
|
| 8010 |
const struct ggml_tensor * src0,
|
| 8011 |
const struct ggml_tensor * src1,
|
| 8012 |
struct ggml_tensor * dst) {
|
| 8013 |
+
GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
|
| 8014 |
|
| 8015 |
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
| 8016 |
return;
|
|
|
|
| 8018 |
const int ith = params->ith;
|
| 8019 |
const int nth = params->nth;
|
| 8020 |
|
| 8021 |
+
#ifdef GGML_USE_CUBLAS
|
| 8022 |
+
if (src1->backend == GGML_BACKEND_CUDA) {
|
| 8023 |
+
if (ith == 0) {
|
| 8024 |
+
ggml_cuda_mul(src0, src1, dst);
|
| 8025 |
+
}
|
| 8026 |
+
return;
|
| 8027 |
+
}
|
| 8028 |
+
#endif
|
| 8029 |
+
|
| 8030 |
+
const int64_t nr = ggml_nrows(src0);
|
| 8031 |
+
|
| 8032 |
+
const int64_t ne00 = src0->ne[0];
|
| 8033 |
+
const int64_t ne01 = src0->ne[1];
|
| 8034 |
+
const int64_t ne02 = src0->ne[2];
|
| 8035 |
+
|
| 8036 |
+
const int64_t ne10 = src1->ne[0];
|
| 8037 |
+
const int64_t ne11 = src1->ne[1];
|
| 8038 |
+
const int64_t ne12 = src1->ne[2];
|
| 8039 |
+
const int64_t ne13 = src1->ne[3];
|
| 8040 |
|
| 8041 |
const size_t nb00 = src0->nb[0];
|
| 8042 |
const size_t nb01 = src0->nb[1];
|
|
|
|
| 8055 |
|
| 8056 |
GGML_ASSERT( nb0 == sizeof(float));
|
| 8057 |
GGML_ASSERT(nb00 == sizeof(float));
|
| 8058 |
+
GGML_ASSERT(ne00 == ne10);
|
| 8059 |
|
| 8060 |
if (nb10 == sizeof(float)) {
|
| 8061 |
+
for (int64_t ir = ith; ir < nr; ir += nth) {
|
| 8062 |
+
// src0 and dst are same shape => same indices
|
| 8063 |
+
const int64_t i03 = ir/(ne02*ne01);
|
| 8064 |
+
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
| 8065 |
+
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
| 8066 |
+
|
| 8067 |
+
const int64_t i13 = i03 % ne13;
|
| 8068 |
+
const int64_t i12 = i02 % ne12;
|
| 8069 |
+
const int64_t i11 = i01 % ne11;
|
| 8070 |
|
| 8071 |
+
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
| 8072 |
+
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
| 8073 |
+
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
|
| 8074 |
|
| 8075 |
#ifdef GGML_USE_ACCELERATE
|
| 8076 |
UNUSED(ggml_vec_mul_f32);
|
| 8077 |
|
| 8078 |
+
vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
|
| 8079 |
#else
|
| 8080 |
+
ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
|
| 8081 |
#endif
|
| 8082 |
// }
|
| 8083 |
// }
|
| 8084 |
}
|
| 8085 |
} else {
|
| 8086 |
// src1 is not contiguous
|
| 8087 |
+
for (int64_t ir = ith; ir < nr; ir += nth) {
|
| 8088 |
+
// src0 and dst are same shape => same indices
|
| 8089 |
+
// src1 is broadcastable across src0 and dst in i1, i2, i3
|
| 8090 |
+
const int64_t i03 = ir/(ne02*ne01);
|
| 8091 |
+
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
| 8092 |
+
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
| 8093 |
|
| 8094 |
+
const int64_t i13 = i03 % ne13;
|
| 8095 |
+
const int64_t i12 = i02 % ne12;
|
| 8096 |
+
const int64_t i11 = i01 % ne11;
|
| 8097 |
+
|
| 8098 |
+
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
| 8099 |
+
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
| 8100 |
+
|
| 8101 |
+
for (int64_t i0 = 0; i0 < ne00; i0++) {
|
| 8102 |
+
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
|
| 8103 |
|
| 8104 |
dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
|
| 8105 |
}
|
|
|
|
| 10588 |
assert(src1->type == GGML_TYPE_I32);
|
| 10589 |
assert(ggml_nelements(src1) == 2);
|
| 10590 |
|
| 10591 |
+
const int ith = params->ith;
|
| 10592 |
+
const int nth = params->nth;
|
| 10593 |
+
|
| 10594 |
const int n_past = ((int32_t *) src1->data)[0];
|
| 10595 |
const bool inplace = (bool)((int32_t *) src1->data)[1];
|
| 10596 |
|
| 10597 |
+
assert(n_past >= 0);
|
| 10598 |
|
| 10599 |
+
if (!inplace && (params->type == GGML_TASK_INIT)) {
|
| 10600 |
+
// memcpy needs to be synchronized across threads to avoid race conditions.
|
| 10601 |
+
// => do it in INIT phase
|
| 10602 |
+
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
| 10603 |
+
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
|
| 10604 |
+
memcpy(
|
| 10605 |
+
((char *) dst->data),
|
| 10606 |
+
((char *) src0->data),
|
| 10607 |
+
ggml_nbytes(dst));
|
| 10608 |
}
|
| 10609 |
|
| 10610 |
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
| 10611 |
return;
|
| 10612 |
}
|
| 10613 |
|
| 10614 |
// TODO: handle transposed/permuted matrices
|
| 10615 |
|
| 10616 |
const int n = ggml_nrows(src0);
|
|
|
|
| 10764 |
struct ggml_tensor * dst) {
|
| 10765 |
assert(params->ith == 0);
|
| 10766 |
assert(src1->type == GGML_TYPE_I32);
|
| 10767 |
+
assert(ggml_nelements(src1) == 3);
|
| 10768 |
|
| 10769 |
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
| 10770 |
return;
|
| 10771 |
}
|
| 10772 |
|
| 10773 |
+
const int n_past = ((int32_t *) src1->data)[0];
|
| 10774 |
+
const int n_head = ((int32_t *) src1->data)[1];
|
| 10775 |
+
const float max_bias = ((float *) src1->data)[2];
|
| 10776 |
|
| 10777 |
assert(n_past >= 0);
|
| 10778 |
|
|
|
|
| 10795 |
// add alibi to src0 (KQ_scaled)
|
| 10796 |
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
| 10797 |
|
| 10798 |
+
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
| 10799 |
+
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
| 10800 |
|
| 10801 |
for (int i = 0; i < ne0; i++) {
|
| 10802 |
for (int j = 0; j < ne1; j++) {
|
|
|
|
| 10814 |
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
|
| 10815 |
}
|
| 10816 |
|
| 10817 |
+
pdst[0] = (i-ne0+1) * m_k + src[0];
|
| 10818 |
+
|
| 10819 |
}
|
| 10820 |
}
|
| 10821 |
}
|
| 10822 |
}
|
| 10823 |
|
|
|
|
| 10824 |
static void ggml_compute_forward_alibi_f16(
|
| 10825 |
const struct ggml_compute_params * params,
|
| 10826 |
const struct ggml_tensor * src0,
|
|
|
|
| 10828 |
struct ggml_tensor * dst) {
|
| 10829 |
assert(params->ith == 0);
|
| 10830 |
assert(src1->type == GGML_TYPE_I32);
|
| 10831 |
+
assert(ggml_nelements(src1) == 3);
|
| 10832 |
|
| 10833 |
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
| 10834 |
return;
|
| 10835 |
}
|
| 10836 |
|
| 10837 |
+
const int n_past = ((int32_t *) src1->data)[0];
|
| 10838 |
+
const int n_head = ((int32_t *) src1->data)[1];
|
| 10839 |
+
const float max_bias = ((float *) src1->data)[2];
|
| 10840 |
|
| 10841 |
assert(n_past >= 0);
|
| 10842 |
|
|
|
|
| 10859 |
// add alibi to src0 (KQ_scaled)
|
| 10860 |
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
| 10861 |
|
| 10862 |
+
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
| 10863 |
+
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
| 10864 |
|
| 10865 |
for (int i = 0; i < ne0; i++) {
|
| 10866 |
for (int j = 0; j < ne1; j++) {
|
|
|
|
| 10879 |
}
|
| 10880 |
|
| 10881 |
// we return F32
|
| 10882 |
+
pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
|
| 10883 |
}
|
| 10884 |
}
|
| 10885 |
}
|
|
|
|
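The m0/m1 constants above implement the standard ALiBi slope schedule: with a power-of-two head count n and bias_max = 8, head k gets slope 2^(-8(k+1)/n), and heads past the nearest power of two fall back to the interpolated m1 series (the m_k branch visible above). A scalar sketch of the per-head slope, assuming the k < n_heads_log2_floor branch uses powf(m0, k + 1) as in the reference ALiBi formulation:

#include <math.h>

// ALiBi slope for head k, mirroring the m0/m1 scheme in the forward pass
static float alibi_slope(int k, int n_head, float max_bias) {
    const int   n_pow2 = 1 << (int) floor(log2((double) n_head));
    const float m0     = powf(2.0f, -max_bias          / n_pow2);
    const float m1     = powf(2.0f, -(max_bias / 2.0f) / n_pow2);

    return (k < n_pow2) ? powf(m0, (float) (k + 1))
                        : powf(m1, (float) (2*(k - n_pow2) + 1));
}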
| 10915 |
}
|
| 10916 |
}
|
| 10917 |
|
| 10918 |
+
|
| 10919 |
+
// ggml_compute_forward_clamp
|
| 10920 |
+
|
| 10921 |
+
static void ggml_compute_forward_clamp_f32(
|
| 10922 |
+
const struct ggml_compute_params * params,
|
| 10923 |
+
const struct ggml_tensor * src0,
|
| 10924 |
+
const struct ggml_tensor * src1,
|
| 10925 |
+
struct ggml_tensor * dst) {
|
| 10926 |
+
assert(params->ith == 0);
|
| 10927 |
+
assert(src1->type == GGML_TYPE_I32);
|
| 10928 |
+
assert(ggml_nelements(src1) == 2);
|
| 10929 |
+
|
| 10930 |
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
| 10931 |
+
return;
|
| 10932 |
+
}
|
| 10933 |
+
|
| 10934 |
+
const float min = ((float *) src1->data)[0];
|
| 10935 |
+
const float max = ((float *) src1->data)[1];
|
| 10936 |
+
|
| 10937 |
+
const int ith = params->ith;
|
| 10938 |
+
const int nth = params->nth;
|
| 10939 |
+
|
| 10940 |
+
const int n = ggml_nrows(src0);
|
| 10941 |
+
const int nc = src0->ne[0];
|
| 10942 |
+
|
| 10943 |
+
const size_t nb00 = src0->nb[0];
|
| 10944 |
+
const size_t nb01 = src0->nb[1];
|
| 10945 |
+
|
| 10946 |
+
const size_t nb0 = dst->nb[0];
|
| 10947 |
+
const size_t nb1 = dst->nb[1];
|
| 10948 |
+
|
| 10949 |
+
GGML_ASSERT( nb0 == sizeof(float));
|
| 10950 |
+
GGML_ASSERT(nb00 == sizeof(float));
|
| 10951 |
+
|
| 10952 |
+
for (int j = ith; j < n; j += nth) {
|
| 10953 |
+
float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
|
| 10954 |
+
float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
|
| 10955 |
+
|
| 10956 |
+
for (int i = 0; i < nc; i++) {
|
| 10957 |
+
dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
|
| 10958 |
+
}
|
| 10959 |
+
}
|
| 10960 |
+
}
|
| 10961 |
+
|
| 10962 |
+
static void ggml_compute_forward_clamp(
|
| 10963 |
+
const struct ggml_compute_params * params,
|
| 10964 |
+
const struct ggml_tensor * src0,
|
| 10965 |
+
const struct ggml_tensor * src1,
|
| 10966 |
+
struct ggml_tensor * dst) {
|
| 10967 |
+
switch (src0->type) {
|
| 10968 |
+
case GGML_TYPE_F32:
|
| 10969 |
+
{
|
| 10970 |
+
ggml_compute_forward_clamp_f32(params, src0, src1, dst);
|
| 10971 |
+
} break;
|
| 10972 |
+
case GGML_TYPE_F16:
|
| 10973 |
+
case GGML_TYPE_Q4_0:
|
| 10974 |
+
case GGML_TYPE_Q4_1:
|
| 10975 |
+
case GGML_TYPE_Q5_0:
|
| 10976 |
+
case GGML_TYPE_Q5_1:
|
| 10977 |
+
case GGML_TYPE_Q8_0:
|
| 10978 |
+
case GGML_TYPE_Q8_1:
|
| 10979 |
+
case GGML_TYPE_I8:
|
| 10980 |
+
case GGML_TYPE_I16:
|
| 10981 |
+
case GGML_TYPE_I32:
|
| 10982 |
+
case GGML_TYPE_COUNT:
|
| 10983 |
+
{
|
| 10984 |
+
GGML_ASSERT(false);
|
| 10985 |
+
} break;
|
| 10986 |
+
}
|
| 10987 |
+
}
|
| 10988 |
+
|
| 10989 |
// ggml_compute_forward_rope
|
| 10990 |
|
| 10991 |
static void ggml_compute_forward_rope_f32(
|
|
|
|
| 12967 |
{
|
| 12968 |
ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
|
| 12969 |
} break;
|
| 12970 |
+
case GGML_OP_CLAMP:
|
| 12971 |
+
{
|
| 12972 |
+
ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
|
| 12973 |
+
} break;
|
| 12974 |
case GGML_OP_CONV_1D_1S:
|
| 12975 |
{
|
| 12976 |
ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
|
|
|
|
| 13278 |
{
|
| 13279 |
GGML_ASSERT(false); // TODO: not implemented
|
| 13280 |
} break;
|
| 13281 |
+
case GGML_OP_CLAMP:
|
| 13282 |
+
{
|
| 13283 |
+
GGML_ASSERT(false); // TODO: not implemented
|
| 13284 |
+
} break;
|
| 13285 |
case GGML_OP_SILU:
|
| 13286 |
{
|
| 13287 |
// necessary for llama
|
|
|
|
| 14161 |
{
|
| 14162 |
node->n_tasks = 1; //TODO
|
| 14163 |
} break;
|
| 14164 |
+
case GGML_OP_CLAMP:
|
| 14165 |
+
{
|
| 14166 |
+
node->n_tasks = 1; //TODO
|
| 14167 |
+
} break;
|
| 14168 |
case GGML_OP_CONV_1D_1S:
|
| 14169 |
case GGML_OP_CONV_1D_2S:
|
| 14170 |
{
|
ggml.h
CHANGED
|
@@ -190,7 +190,7 @@
|
|
| 190 |
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
| 191 |
#define GGML_FILE_VERSION 1
|
| 192 |
|
| 193 |
-
#define GGML_QNT_VERSION
|
| 194 |
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
| 195 |
|
| 196 |
#define GGML_MAX_DIMS 4
|
|
@@ -313,6 +313,7 @@ extern "C" {
|
|
| 313 |
GGML_OP_ROPE,
|
| 314 |
GGML_OP_ROPE_BACK,
|
| 315 |
GGML_OP_ALIBI,
|
|
|
|
| 316 |
GGML_OP_CONV_1D_1S,
|
| 317 |
GGML_OP_CONV_1D_2S,
|
| 318 |
|
|
@@ -849,7 +850,7 @@ extern "C" {
|
|
| 849 |
int n_past);
|
| 850 |
|
| 851 |
// in-place, returns view(a)
|
| 852 |
-
GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
|
| 853 |
struct ggml_context * ctx,
|
| 854 |
struct ggml_tensor * a,
|
| 855 |
int n_past);
|
|
@@ -897,7 +898,16 @@ extern "C" {
|
|
| 897 |
struct ggml_context * ctx,
|
| 898 |
struct ggml_tensor * a,
|
| 899 |
int n_past,
|
| 900 |
-
int n_head
|
|
| 901 |
|
| 902 |
// padding = 1
|
| 903 |
// TODO: we don't support extra parameters for now
|
|
|
|
| 190 |
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
| 191 |
#define GGML_FILE_VERSION 1
|
| 192 |
|
| 193 |
+
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
|
| 194 |
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
| 195 |
|
| 196 |
#define GGML_MAX_DIMS 4
|
|
|
|
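GGML_QNT_VERSION gets the value 2 here because the block layouts above changed; together with GGML_QNT_VERSION_FACTOR it is conventionally folded into a model file's ftype field so loaders can reject files quantized with an older scheme. A hedged sketch of that packing convention (the field and function names are assumptions; the arithmetic only shows how the two defines are meant to combine):

#include <stdint.h>

#define DEMO_QNT_VERSION        2     // stands in for GGML_QNT_VERSION
#define DEMO_QNT_VERSION_FACTOR 1000  // stands in for GGML_QNT_VERSION_FACTOR

static int32_t pack_ftype(int32_t ftype)      { return ftype + DEMO_QNT_VERSION * DEMO_QNT_VERSION_FACTOR; }
static int32_t stored_qnt_version(int32_t f)  { return f / DEMO_QNT_VERSION_FACTOR; }
static int32_t stored_ftype(int32_t f)        { return f % DEMO_QNT_VERSION_FACTOR; }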
| 313 |
GGML_OP_ROPE,
|
| 314 |
GGML_OP_ROPE_BACK,
|
| 315 |
GGML_OP_ALIBI,
|
| 316 |
+
GGML_OP_CLAMP,
|
| 317 |
GGML_OP_CONV_1D_1S,
|
| 318 |
GGML_OP_CONV_1D_2S,
|
| 319 |
|
|
|
|
| 850 |
int n_past);
|
| 851 |
|
| 852 |
// in-place, returns view(a)
|
| 853 |
+
GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
|
| 854 |
struct ggml_context * ctx,
|
| 855 |
struct ggml_tensor * a,
|
| 856 |
int n_past);
|
|
|
|
| 898 |
struct ggml_context * ctx,
|
| 899 |
struct ggml_tensor * a,
|
| 900 |
int n_past,
|
| 901 |
+
int n_head,
|
| 902 |
+
float bias_max);
|
| 903 |
+
|
| 904 |
+
// clamp
|
| 905 |
+
// in-place, returns view(a)
|
| 906 |
+
struct ggml_tensor * ggml_clamp(
|
| 907 |
+
struct ggml_context * ctx,
|
| 908 |
+
struct ggml_tensor * a,
|
| 909 |
+
float min,
|
| 910 |
+
float max);
|
| 911 |
|
| 912 |
// padding = 1
|
| 913 |
// TODO: we don't support extra parameters for now
|