Spaces:
Sleeping
Sleeping
ggml : fix quantized cpy op (llama/12310)
Browse files
* ggml : fix quantized cpy op
ggml-ci
* tests : add cpy tests for all types
ggml-ci
* tests : add BF16 copy tests
ggml-ci
* tests : fix loop for same-type copy
ggml-ci
* tests : add option to permute the dst tensor
ggml-ci
- ggml/src/ggml-cpu/ggml-cpu.c +31 -27
ggml/src/ggml-cpu/ggml-cpu.c
CHANGED
|
@@ -3110,17 +3110,17 @@ static void ggml_compute_forward_dup_same_cont(
|
|
| 3110 |
const int ith = params->ith; // thread index
|
| 3111 |
const int nth = params->nth; // number of threads
|
| 3112 |
|
| 3113 |
-
// parallelize by
|
| 3114 |
-
const int
|
| 3115 |
-
const int dr = (
|
| 3116 |
-
const int
|
| 3117 |
-
const int
|
| 3118 |
|
| 3119 |
-
if (
|
| 3120 |
memcpy(
|
| 3121 |
-
((char *) dst->data +
|
| 3122 |
-
((char *) src0->data +
|
| 3123 |
-
(
|
| 3124 |
}
|
| 3125 |
}
|
| 3126 |
|
|
@@ -4055,7 +4055,6 @@ static void ggml_compute_forward_dup_f32(
|
|
| 4055 |
static void ggml_compute_forward_dup_bytes(
|
| 4056 |
const struct ggml_compute_params * params,
|
| 4057 |
struct ggml_tensor * dst) {
|
| 4058 |
-
|
| 4059 |
const struct ggml_tensor * src0 = dst->src[0];
|
| 4060 |
|
| 4061 |
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
|
@@ -4069,10 +4068,10 @@ static void ggml_compute_forward_dup_bytes(
|
|
| 4069 |
}
|
| 4070 |
|
| 4071 |
const size_t type_size = ggml_type_size(src0->type);
|
|
|
|
| 4072 |
const int ith = params->ith; // thread index
|
| 4073 |
const int nth = params->nth; // number of threads
|
| 4074 |
|
| 4075 |
-
|
| 4076 |
// parallelize by rows
|
| 4077 |
const int nr = ne01;
|
| 4078 |
// number of rows per thread
|
|
@@ -4082,10 +4081,10 @@ static void ggml_compute_forward_dup_bytes(
|
|
| 4082 |
const int ir1 = MIN(ir0 + dr, nr);
|
| 4083 |
|
| 4084 |
if (src0->type == dst->type &&
|
| 4085 |
-
|
| 4086 |
nb00 == type_size && nb0 == type_size) {
|
| 4087 |
// copy by rows
|
| 4088 |
-
const size_t rs = ne00
|
| 4089 |
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
| 4090 |
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
| 4091 |
for (int64_t i01 = ir0; i01 < ir1; i01++) {
|
|
@@ -4140,17 +4139,20 @@ static void ggml_compute_forward_dup_bytes(
|
|
| 4140 |
}
|
| 4141 |
|
| 4142 |
// dst counters
|
| 4143 |
-
|
| 4144 |
-
int64_t i10 = 0;
|
| 4145 |
int64_t i11 = 0;
|
| 4146 |
int64_t i12 = 0;
|
| 4147 |
int64_t i13 = 0;
|
| 4148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4149 |
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
| 4150 |
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
| 4151 |
-
|
| 4152 |
-
while (
|
| 4153 |
-
|
| 4154 |
if (++i11 == ne1) {
|
| 4155 |
i11 = 0;
|
| 4156 |
if (++i12 == ne2) {
|
|
@@ -4162,14 +4164,14 @@ static void ggml_compute_forward_dup_bytes(
|
|
| 4162 |
}
|
| 4163 |
}
|
| 4164 |
for (int64_t i01 = ir0; i01 < ir1; i01++) {
|
| 4165 |
-
for (int64_t
|
| 4166 |
-
const char * src0_ptr = ((char *) src0->data +
|
| 4167 |
-
char * dst_ptr = ((char *) dst->data +
|
| 4168 |
|
| 4169 |
memcpy(dst_ptr, src0_ptr, type_size);
|
| 4170 |
|
| 4171 |
-
if (++
|
| 4172 |
-
|
| 4173 |
if (++i11 == ne1) {
|
| 4174 |
i11 = 0;
|
| 4175 |
if (++i12 == ne2) {
|
|
@@ -4182,9 +4184,9 @@ static void ggml_compute_forward_dup_bytes(
|
|
| 4182 |
}
|
| 4183 |
}
|
| 4184 |
}
|
| 4185 |
-
|
| 4186 |
-
while (
|
| 4187 |
-
|
| 4188 |
if (++i11 == ne1) {
|
| 4189 |
i11 = 0;
|
| 4190 |
if (++i12 == ne2) {
|
|
@@ -14308,7 +14310,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
| 14308 |
}
|
| 14309 |
|
| 14310 |
// extra_buffer op?
|
| 14311 |
-
if (ggml_cpu_extra_compute_forward(params, tensor))
|
|
|
|
|
|
|
| 14312 |
|
| 14313 |
switch (tensor->op) {
|
| 14314 |
case GGML_OP_DUP:
|
|
|
|
| 3110 |
const int ith = params->ith; // thread index
|
| 3111 |
const int nth = params->nth; // number of threads
|
| 3112 |
|
| 3113 |
+
// parallelize by blocks
|
| 3114 |
+
const int nk = ggml_nelements(src0)/ggml_blck_size(src0->type);
|
| 3115 |
+
const int dr = (nk + nth - 1) / nth;
|
| 3116 |
+
const int k0 = dr * ith;
|
| 3117 |
+
const int k1 = MIN(k0 + dr, nk);
|
| 3118 |
|
| 3119 |
+
if (k0 < k1) {
|
| 3120 |
memcpy(
|
| 3121 |
+
((char *) dst->data + k0*nb0),
|
| 3122 |
+
((char *) src0->data + k0*nb0),
|
| 3123 |
+
(k1 - k0) * nb0);
|
| 3124 |
}
|
| 3125 |
}
|
| 3126 |
|
|
|
|
| 4055 |
static void ggml_compute_forward_dup_bytes(
|
| 4056 |
const struct ggml_compute_params * params,
|
| 4057 |
struct ggml_tensor * dst) {
|
|
|
|
| 4058 |
const struct ggml_tensor * src0 = dst->src[0];
|
| 4059 |
|
| 4060 |
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
|
|
|
| 4068 |
}
|
| 4069 |
|
| 4070 |
const size_t type_size = ggml_type_size(src0->type);
|
| 4071 |
+
|
| 4072 |
const int ith = params->ith; // thread index
|
| 4073 |
const int nth = params->nth; // number of threads
|
| 4074 |
|
|
|
|
| 4075 |
// parallelize by rows
|
| 4076 |
const int nr = ne01;
|
| 4077 |
// number of rows per thread
|
|
|
|
| 4081 |
const int ir1 = MIN(ir0 + dr, nr);
|
| 4082 |
|
| 4083 |
if (src0->type == dst->type &&
|
| 4084 |
+
ggml_are_same_shape(src0, dst) &&
|
| 4085 |
nb00 == type_size && nb0 == type_size) {
|
| 4086 |
// copy by rows
|
| 4087 |
+
const size_t rs = ggml_row_size(src0->type, ne00);
|
| 4088 |
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
| 4089 |
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
| 4090 |
for (int64_t i01 = ir0; i01 < ir1; i01++) {
|
|
|
|
| 4139 |
}
|
| 4140 |
|
| 4141 |
// dst counters
|
| 4142 |
+
int64_t k10 = 0;
|
|
|
|
| 4143 |
int64_t i11 = 0;
|
| 4144 |
int64_t i12 = 0;
|
| 4145 |
int64_t i13 = 0;
|
| 4146 |
|
| 4147 |
+
// number of blocks in a row
|
| 4148 |
+
const int64_t nk00 = ne00 / ggml_blck_size(src0->type);
|
| 4149 |
+
const int64_t nk0 = ne0 / ggml_blck_size(dst->type);
|
| 4150 |
+
|
| 4151 |
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
| 4152 |
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
| 4153 |
+
k10 += nk00 * ir0;
|
| 4154 |
+
while (k10 >= nk0) {
|
| 4155 |
+
k10 -= nk0;
|
| 4156 |
if (++i11 == ne1) {
|
| 4157 |
i11 = 0;
|
| 4158 |
if (++i12 == ne2) {
|
|
|
|
| 4164 |
}
|
| 4165 |
}
|
| 4166 |
for (int64_t i01 = ir0; i01 < ir1; i01++) {
|
| 4167 |
+
for (int64_t k00 = 0; k00 < nk00; k00++) {
|
| 4168 |
+
const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
| 4169 |
+
char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
| 4170 |
|
| 4171 |
memcpy(dst_ptr, src0_ptr, type_size);
|
| 4172 |
|
| 4173 |
+
if (++k10 == nk0) {
|
| 4174 |
+
k10 = 0;
|
| 4175 |
if (++i11 == ne1) {
|
| 4176 |
i11 = 0;
|
| 4177 |
if (++i12 == ne2) {
|
|
|
|
| 4184 |
}
|
| 4185 |
}
|
| 4186 |
}
|
| 4187 |
+
k10 += nk00 * (ne01 - ir1);
|
| 4188 |
+
while (k10 >= nk0) {
|
| 4189 |
+
k10 -= nk0;
|
| 4190 |
if (++i11 == ne1) {
|
| 4191 |
i11 = 0;
|
| 4192 |
if (++i12 == ne2) {
|
|
|
|
| 14310 |
}
|
| 14311 |
|
| 14312 |
// extra_buffer op?
|
| 14313 |
+
if (ggml_cpu_extra_compute_forward(params, tensor)) {
|
| 14314 |
+
return;
|
| 14315 |
+
}
|
| 14316 |
|
| 14317 |
switch (tensor->op) {
|
| 14318 |
case GGML_OP_DUP:
|