ggerganov committed
Commit 608b377 · 1 Parent(s): 5e508d2

ggml : fix quantized cpy op (llama/12310)


* ggml : fix quantized cpy op

ggml-ci

* tests : add cpy tests for all types

ggml-ci

* tests : add BF16 copy tests

ggml-ci

* tests : fix loop for same-type copy

ggml-ci

* tests : add option to permute the dst tensor

ggml-ci
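
In short: the copy kernels indexed quantized tensors element by element, but for block-quantized types ggml_type_size and the nb strides are per block, so element-based offsets run past the data. The patch makes the affected loops count blocks instead, dividing element counts by ggml_blck_size. A toy illustration of the mismatch (not from the patch), assuming a Q8_0-style layout of 32-element, 34-byte blocks:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t blck = 32;  // elements per block (as in Q8_0)
    const int64_t nb0  = 34;  // byte stride of ONE BLOCK, not one element
    const int64_t i    = 640; // a split point chosen in elements

    // Buggy: multiplying an element index by a per-block stride lands
    // 32x too far into the buffer (21760 bytes here).
    printf("element-indexed offset: %lld bytes\n", (long long)(i * nb0));

    // Fixed: convert the element index to a block index first (680 bytes).
    printf("block-indexed offset:   %lld bytes\n", (long long)(i / blck * nb0));
    return 0;
}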

Files changed (1)
  1. ggml/src/ggml-cpu/ggml-cpu.c +31 -27
ggml/src/ggml-cpu/ggml-cpu.c CHANGED

@@ -3110,17 +3110,17 @@ static void ggml_compute_forward_dup_same_cont(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-    // parallelize by elements
-    const int ne = ggml_nelements(dst);
-    const int dr = (ne + nth - 1) / nth;
-    const int ie0 = dr * ith;
-    const int ie1 = MIN(ie0 + dr, ne);
+    // parallelize by blocks
+    const int nk = ggml_nelements(src0)/ggml_blck_size(src0->type);
+    const int dr = (nk + nth - 1) / nth;
+    const int k0 = dr * ith;
+    const int k1 = MIN(k0 + dr, nk);
 
-    if (ie0 < ie1) {
+    if (k0 < k1) {
         memcpy(
-            ((char *) dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb0),
-            (ie1 - ie0) * nb0);
+            ((char *) dst->data + k0*nb0),
+            ((char *) src0->data + k0*nb0),
+            (k1 - k0) * nb0);
     }
 }
 
 
@@ -4055,7 +4055,6 @@ static void ggml_compute_forward_dup_f32(
 static void ggml_compute_forward_dup_bytes(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst) {
-
     const struct ggml_tensor * src0 = dst->src[0];
 
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
 
@@ -4069,10 +4068,10 @@ static void ggml_compute_forward_dup_bytes(
     }
 
     const size_t type_size = ggml_type_size(src0->type);
+
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread

@@ -4082,10 +4081,10 @@ static void ggml_compute_forward_dup_bytes(
     const int ir1 = MIN(ir0 + dr, nr);
 
     if (src0->type == dst->type &&
-        ne00 == ne0 &&
+        ggml_are_same_shape(src0, dst) &&
         nb00 == type_size && nb0 == type_size) {
         // copy by rows
-        const size_t rs = ne00 * type_size;
+        const size_t rs = ggml_row_size(src0->type, ne00);
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 for (int64_t i01 = ir0; i01 < ir1; i01++) {
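
Two fixes in the row-copy fast path above: the guard now requires the full shapes to match (ggml_are_same_shape) rather than only the first dimension (ne00 == ne0), and the row size comes from ggml_row_size instead of ne00 * type_size. The latter matters because for block-quantized types ggml_type_size returns the byte size of a whole block, not of one element. A sketch of the distinction, assuming a Q8_0-like layout of 34-byte blocks holding 32 elements each:

#include <stdint.h>
#include <stdio.h>

/* What ggml_row_size computes, in spirit: elements -> blocks -> bytes. */
static size_t row_size(size_t type_size, int64_t blck_size, int64_t ne00) {
    return (size_t) (ne00 / blck_size) * type_size;
}

int main(void) {
    // F32: 1-element "blocks" of 4 bytes; old and new formulas agree.
    printf("f32 row:  %zu bytes\n", row_size(4, 1, 4096));   // 16384
    // Q8_0-like: 32-element blocks of 34 bytes; ne00 * type_size would
    // report 4096 * 34 bytes, overshooting by the block-size factor.
    printf("q8_0 row: %zu bytes\n", row_size(34, 32, 4096)); // 4352
    return 0;
}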
 
@@ -4140,17 +4139,20 @@ static void ggml_compute_forward_dup_bytes(
     }
 
     // dst counters
-
-    int64_t i10 = 0;
+    int64_t k10 = 0;
     int64_t i11 = 0;
     int64_t i12 = 0;
     int64_t i13 = 0;
 
+    // number of blocks in a row
+    const int64_t nk00 = ne00 / ggml_blck_size(src0->type);
+    const int64_t nk0  = ne0  / ggml_blck_size(dst->type);
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            i10 += ne00 * ir0;
-            while (i10 >= ne0) {
-                i10 -= ne0;
+            k10 += nk00 * ir0;
+            while (k10 >= nk0) {
+                k10 -= nk0;
                 if (++i11 == ne1) {
                     i11 = 0;
                     if (++i12 == ne2) {

@@ -4162,14 +4164,14 @@ static void ggml_compute_forward_dup_bytes(
                 }
             }
             for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                    char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+                for (int64_t k00 = 0; k00 < nk00; k00++) {
+                    const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                    char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
 
                     memcpy(dst_ptr, src0_ptr, type_size);
 
-                    if (++i10 == ne0) {
-                        i10 = 0;
+                    if (++k10 == nk0) {
+                        k10 = 0;
                         if (++i11 == ne1) {
                             i11 = 0;
                             if (++i12 == ne2) {

@@ -4182,9 +4184,9 @@ static void ggml_compute_forward_dup_bytes(
                     }
                 }
             }
-            i10 += ne00 * (ne01 - ir1);
-            while (i10 >= ne0) {
-                i10 -= ne0;
+            k10 += nk00 * (ne01 - ir1);
+            while (k10 >= nk0) {
+                k10 -= nk0;
                 if (++i11 == ne1) {
                     i11 = 0;
                     if (++i12 == ne2) {
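
When src0 and dst have different (but compatible) layouts, the general path above walks dst with a flat counter that now advances in blocks: k10 counts blocks within a row (nk0 per dst row) and carries into i11/i12/i13, and each memcpy moves exactly one block of type_size bytes. A runnable sketch of that carry logic with made-up dimensions (nk0 = 4 blocks per row, 2x2x1 rows):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t nk0 = 4, ne1 = 2, ne2 = 2, ne3 = 1; // blocks per row + dims
    int64_t k10 = 0, i11 = 0, i12 = 0, i13 = 0;       // dst position counters

    for (int64_t step = 0; step < nk0*ne1*ne2*ne3; step++) {
        printf("copy one block -> dst[%lld][%lld][%lld][%lld]\n",
               (long long) i13, (long long) i12, (long long) i11, (long long) k10);
        // advance one block; carry into higher dst dimensions on overflow
        if (++k10 == nk0) {
            k10 = 0;
            if (++i11 == ne1) {
                i11 = 0;
                if (++i12 == ne2) {
                    i12 = 0;
                    if (++i13 == ne3) {
                        i13 = 0;
                    }
                }
            }
        }
    }
    return 0;
}

The bulk skips before and after the i01 loop (k10 += nk00 * ir0 and k10 += nk00 * (ne01 - ir1)) play the same carries forward for the rows owned by other threads, so every thread's counters stay in sync with its slice.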
 
@@ -14308,7 +14310,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     }
 
     // extra_buffer op?
-    if (ggml_cpu_extra_compute_forward(params, tensor)) return;
+    if (ggml_cpu_extra_compute_forward(params, tensor)) {
+        return;
+    }
 
     switch (tensor->op) {
         case GGML_OP_DUP: