hipudding committed
Commit 1b9d0f0 · 1 Parent(s): 8f5d919

CANN: Add support for async operator submission (llama/12864)


Submit operators using asynchronous threads to improve performance.

Use the environment variable GGML_CANN_ASYNC_MODE to control whether
asynchronous submission is enabled. It is disabled by default.

Testing shows a 10%–20% performance improvement in scenarios with
small parameter sizes, especially in quantized models.
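
A minimal sketch of how such an environment switch can be read, assuming only what the message above states (the variable name GGML_CANN_ASYNC_MODE and that the feature is disabled by default); the helper name below is illustrative and not part of the backend:

#include <cstdlib>

// Illustrative only: the real ggml-cann backend may parse the value differently.
static bool cann_async_mode_enabled() {
    const char* v = std::getenv("GGML_CANN_ASYNC_MODE");
    // Assumption: unset (or empty) means disabled, the documented default.
    return v != nullptr && v[0] != '\0';
}

Under that assumption, exporting GGML_CANN_ASYNC_MODE=1 before running enables asynchronous submission, while leaving it unset keeps the default synchronous path.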

ggml/src/ggml-cann/aclnn_ops.cpp CHANGED
@@ -103,9 +103,7 @@ void ggml_cann_unary_op(
103
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
104
 
105
  unary_op(ctx, acl_src, acl_dst);
106
-
107
- ACL_CHECK(aclDestroyTensor(acl_src));
108
- ACL_CHECK(aclDestroyTensor(acl_dst));
109
  }
110
 
111
  /**
@@ -123,8 +121,8 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
123
  // repeat tensor along each dim with repeat_array
124
  aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS);
125
 
126
- GGML_CANN_CALL_ACLNN_OP(Repeat, acl_src, repeats, acl_dst);
127
- ACL_CHECK(aclDestroyIntArray(repeats));
128
  }
129
 
130
  /**
@@ -142,7 +140,7 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
142
  */
143
  static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
144
  aclTensor* acl_dst, aclDataType cast_data_type) {
145
- GGML_CANN_CALL_ACLNN_OP(Cast, acl_src, cast_data_type, acl_dst);
146
  }
147
 
148
  void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -156,8 +154,7 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
156
  dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]};
157
 
158
  aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray);
159
- ACL_CHECK(aclDestroyTensor(acl_src));
160
- ACL_CHECK(aclDestroyTensor(acl_dst));
161
  }
162
 
163
  void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
@@ -165,10 +162,10 @@ void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
165
  float alphaValue = 1.0f;
166
  aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
167
  if (acl_dst != nullptr)
168
- GGML_CANN_CALL_ACLNN_OP(Add, acl_src0, acl_src1, alpha, acl_dst);
169
  else
170
- GGML_CANN_CALL_ACLNN_OP(InplaceAdd, acl_src0, acl_src1, alpha);
171
- ACL_CHECK(aclDestroyScalar(alpha));
172
  }
173
 
174
  void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
@@ -176,26 +173,26 @@ void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
176
  float alphaValue = 1.0f;
177
  aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
178
  if (acl_dst != nullptr)
179
- GGML_CANN_CALL_ACLNN_OP(Sub, acl_src0, acl_src1, alpha, acl_dst);
180
  else
181
- GGML_CANN_CALL_ACLNN_OP(InplaceSub, acl_src0, acl_src1, alpha);
182
- ACL_CHECK(aclDestroyScalar(alpha));
183
  }
184
 
185
  void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
186
  aclTensor* acl_other, aclTensor* acl_dst) {
187
  if (acl_dst != nullptr)
188
- GGML_CANN_CALL_ACLNN_OP(Mul, acl_src, acl_other, acl_dst);
189
  else
190
- GGML_CANN_CALL_ACLNN_OP(InplaceMul, acl_src, acl_other);
191
  }
192
 
193
  void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
194
  aclTensor* acl_other, aclTensor* acl_dst) {
195
  if (acl_dst != nullptr)
196
- GGML_CANN_CALL_ACLNN_OP(Div, acl_src, acl_other, acl_dst);
197
  else
198
- GGML_CANN_CALL_ACLNN_OP(InplaceDiv, acl_src, acl_other);
199
  }
200
 
201
  /**
@@ -224,11 +221,11 @@ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src,
224
  float scale, aclTensor* acl_dst, bool inplace) {
225
  aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
226
  if (inplace) {
227
- GGML_CANN_CALL_ACLNN_OP(InplaceMuls, acl_src, acl_scale);
228
  } else {
229
- GGML_CANN_CALL_ACLNN_OP(Muls, acl_src, acl_scale, acl_dst);
230
  }
231
- ACL_CHECK(aclDestroyScalar(acl_scale));
232
  }
233
 
234
  void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -245,11 +242,8 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
245
  aclScalar* acl_negative_slope =
246
  aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);
247
 
248
- GGML_CANN_CALL_ACLNN_OP(LeakyRelu, acl_src, acl_negative_slope, acl_dst);
249
-
250
- ACL_CHECK(aclDestroyScalar(acl_negative_slope));
251
- ACL_CHECK(aclDestroyTensor(acl_src));
252
- ACL_CHECK(aclDestroyTensor(acl_dst));
253
  }
254
 
255
  /**
@@ -265,7 +259,7 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
265
  static void aclnn_concat(ggml_backend_cann_context& ctx,
266
  aclTensorList* tensorList, aclTensor* acl_dst,
267
  int64_t concat_dim) {
268
- GGML_CANN_CALL_ACLNN_OP(Cat, tensorList, concat_dim, acl_dst);
269
  }
270
 
271
  void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -281,11 +275,10 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
281
  int32_t acl_dim = 3 - dim;
282
 
283
  aclTensor* tensors[] = {acl_src0, acl_src1};
284
- aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
285
- aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
286
 
287
- ACL_CHECK(aclDestroyTensorList(tensorList));
288
- ACL_CHECK(aclDestroyTensor(acl_dst));
289
  }
290
 
291
  /**
@@ -315,10 +308,8 @@ static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst,
315
  aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT);
316
  aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT);
317
 
318
- GGML_CANN_CALL_ACLNN_OP(Arange, acl_start, acl_end, acl_step, acl_dst);
319
- ACL_CHECK(aclDestroyScalar(acl_start));
320
- ACL_CHECK(aclDestroyScalar(acl_end));
321
- ACL_CHECK(aclDestroyScalar(acl_step));
322
  }
323
 
324
  void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -335,7 +326,7 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
335
  memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
336
 
337
  aclnn_arange(ctx, acl_dst, start, stop, step, n_elements);
338
- ACL_CHECK(aclDestroyTensor(acl_dst));
339
  }
340
 
341
  void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -352,11 +343,8 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
352
  aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT);
353
  aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT);
354
 
355
- GGML_CANN_CALL_ACLNN_OP(Clamp, acl_src, acl_min, acl_max, acl_dst);
356
- ACL_CHECK(aclDestroyScalar(acl_min));
357
- ACL_CHECK(aclDestroyScalar(acl_max));
358
- ACL_CHECK(aclDestroyTensor(acl_src));
359
- ACL_CHECK(aclDestroyTensor(acl_dst));
360
  }
361
 
362
  void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -370,10 +358,8 @@ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
370
  aclTensor* acl_src = ggml_cann_create_tensor(src);
371
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
372
 
373
- GGML_CANN_CALL_ACLNN_OP(Muls, acl_src, scale, acl_dst);
374
- ACL_CHECK(aclDestroyScalar(scale));
375
- ACL_CHECK(aclDestroyTensor(acl_src));
376
- ACL_CHECK(aclDestroyTensor(acl_dst));
377
  }
378
 
379
  void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -388,12 +374,10 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
388
  aclTensor* tmp_tensor =
389
  ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type),
390
  dst->ne, dst->nb, GGML_MAX_DIMS);
391
- GGML_CANN_CALL_ACLNN_OP(Argsort, acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false),
392
  tmp_tensor);
393
- GGML_CANN_CALL_ACLNN_OP(Cast, tmp_tensor, ggml_cann_type_mapping(dst->type), acl_dst);
394
- ACL_CHECK(aclDestroyTensor(acl_src));
395
- ACL_CHECK(aclDestroyTensor(tmp_tensor));
396
- ACL_CHECK(aclDestroyTensor(acl_dst));
397
  }
398
 
399
  void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -407,11 +391,9 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
407
 
408
  std::vector<int64_t> normData = {dst->ne[0]};
409
  aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
410
- GGML_CANN_CALL_ACLNN_OP(LayerNorm, acl_src, norm, nullptr, nullptr,
411
  eps, acl_dst, nullptr, nullptr);
412
- ACL_CHECK(aclDestroyIntArray(norm));
413
- ACL_CHECK(aclDestroyTensor(acl_src));
414
- ACL_CHECK(aclDestroyTensor(acl_dst));
415
  }
416
 
417
  void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -441,12 +423,9 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
441
  aclTensor* acl_rstd_out = ggml_cann_create_tensor(
442
  (char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
443
 
444
- GGML_CANN_CALL_ACLNN_OP(GroupNorm, acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps,
445
  acl_dst, acl_mean_out, acl_rstd_out);
446
- ACL_CHECK(aclDestroyTensor(acl_src));
447
- ACL_CHECK(aclDestroyTensor(acl_dst));
448
- ACL_CHECK(aclDestroyTensor(acl_mean_out));
449
- ACL_CHECK(aclDestroyTensor(acl_rstd_out));
450
  }
451
 
452
  void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -471,19 +450,17 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
471
 
472
  if (!inplace) {
473
  size_t cpy_size = ggml_nbytes(dst);
474
- ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size,
475
- ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
476
  aclTensor* acl_src0 = ggml_cann_create_tensor(
477
  src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
478
 
479
- GGML_CANN_CALL_ACLNN_OP(Add, acl_src0, acl_src1, alpha, acl_dst);
480
- ACL_CHECK(aclDestroyTensor(acl_src0));
481
  } else {
482
- GGML_CANN_CALL_ACLNN_OP(InplaceAdd, acl_dst, acl_src1, alpha);
483
  }
484
-
485
- ACL_CHECK(aclDestroyTensor(acl_src1));
486
- ACL_CHECK(aclDestroyTensor(acl_dst));
487
  }
488
 
489
  /**
@@ -496,7 +473,6 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
496
  * @param dim An array of dimension indices.
497
  * @param dim_size The number of dimensions.
498
  */
499
-
500
  static void aclnn_reduce_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst,
501
  int64_t* dim, size_t dim_size) {
502
  GGML_ASSERT(dst->ne[0] == 1);
@@ -505,11 +481,9 @@ static void aclnn_reduce_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst,
505
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
506
  aclIntArray* reduce_dims = aclCreateIntArray(dim, dim_size);
507
 
508
- GGML_CANN_CALL_ACLNN_OP(ReduceSum, acl_src, reduce_dims, true,
509
  ggml_cann_type_mapping(dst->type), acl_dst);
510
- ACL_CHECK(aclDestroyTensor(acl_src));
511
- ACL_CHECK(aclDestroyTensor(acl_dst));
512
- ACL_CHECK(aclDestroyIntArray(reduce_dims));
513
  }
514
 
515
  void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -533,10 +507,8 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
533
  std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]};
534
  auto output_size_array = aclCreateIntArray(output_size.data(), 2);
535
 
536
- GGML_CANN_CALL_ACLNN_OP(UpsampleNearest2d, acl_src, output_size_array, acl_dst);
537
- ACL_CHECK(aclDestroyIntArray(output_size_array));
538
- ACL_CHECK(aclDestroyTensor(acl_src));
539
- ACL_CHECK(aclDestroyTensor(acl_dst));
540
  }
541
 
542
  /**
@@ -559,9 +531,8 @@ static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src,
559
  aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
560
  aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
561
 
562
- GGML_CANN_CALL_ACLNN_OP(ConstantPadNd, acl_src, acl_pad, acl_value, acl_dst);
563
- ACL_CHECK(aclDestroyIntArray(acl_pad));
564
- ACL_CHECK(aclDestroyScalar(acl_value));
565
  }
566
 
567
  void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -577,9 +548,7 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
577
  0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
578
  0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
579
  aclnn_pad(ctx, acl_src, acl_dst, paddings);
580
-
581
- ACL_CHECK(aclDestroyTensor(acl_dst));
582
- ACL_CHECK(aclDestroyTensor(acl_src));
583
  }
584
 
585
  /**
@@ -629,14 +598,11 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx,
629
  cube_math_type = 1;
630
  #endif
631
 
632
- GGML_CANN_CALL_ACLNN_OP(AvgPool2d, acl_src, kernel_size, strides, paddings_avg,
633
  ceil_mode, count_include_pad, divisor_override,
634
  cube_math_type, acl_dst);
635
- ACL_CHECK(aclDestroyTensor(acl_src));
636
- ACL_CHECK(aclDestroyTensor(acl_dst));
637
- ACL_CHECK(aclDestroyIntArray(kernel_size));
638
- ACL_CHECK(aclDestroyIntArray(strides));
639
- ACL_CHECK(aclDestroyIntArray(paddings_avg));
640
  }
641
 
642
  /**
@@ -704,15 +670,10 @@ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx,
704
 
705
  bool ceil_mode = false;
706
  int64_t auto_pads = 0;
707
- GGML_CANN_CALL_ACLNN_OP(MaxPool, tmp_tensor, kernel_size, strides, auto_pads,
708
  paddings_max, dilations, ceil_mode, acl_dst);
709
- ACL_CHECK(aclDestroyTensor(acl_src));
710
- ACL_CHECK(aclDestroyTensor(acl_dst));
711
- ACL_CHECK(aclDestroyTensor(tmp_tensor));
712
- ACL_CHECK(aclDestroyIntArray(kernel_size));
713
- ACL_CHECK(aclDestroyIntArray(strides));
714
- ACL_CHECK(aclDestroyIntArray(paddings_max));
715
- ACL_CHECK(aclDestroyIntArray(dilations));
716
  }
717
 
718
  void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -743,7 +704,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
743
  */
744
  static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
745
  aclTensor* acl_dst) {
746
- GGML_CANN_CALL_ACLNN_OP(InplaceCopy, acl_dst, acl_src);
747
  }
748
 
749
  void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -761,9 +722,8 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
761
  if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
762
  if (dst->type == src0->type) {
763
  size_t cpy_size = ggml_nbytes(dst);
764
- ACL_CHECK(aclrtMemcpyAsync(
765
- dst->data, cpy_size, src0->data, cpy_size,
766
- ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
767
  return;
768
  } else {
769
  ggml_cann_pool_alloc src_buffer_allocator(
@@ -782,10 +742,9 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
782
 
783
  aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
784
  size_t cpy_size = ggml_nbytes(dst);
785
- ACL_CHECK(aclrtMemcpyAsync(
786
- dst->data, cpy_size, src_trans_buffer, cpy_size,
787
- ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
788
- ACL_CHECK(aclDestroyTensor(src_trans_tensor));
789
  return;
790
  }
791
  } else if (ggml_is_contiguous(dst)) {
@@ -805,18 +764,15 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
805
  aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
806
 
807
  size_t cpy_size = ggml_nbytes(dst);
808
- ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src_trans_buffer,
809
- cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
810
- ctx.stream()));
811
- ACL_CHECK(aclDestroyTensor(src_trans_tensor));
812
  return;
813
  } else {
814
  GGML_ABORT("Unsupport dst is not tontiguous.");
815
  }
816
  }
817
-
818
- ACL_CHECK(aclDestroyTensor(acl_src));
819
- ACL_CHECK(aclDestroyTensor(acl_dst));
820
  }
821
 
822
  /**
@@ -844,7 +800,7 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
844
  nb[i] = nb[i - 1] * ne[i - 1];
845
  }
846
 
847
- ACL_CHECK(aclrtMemsetAsync(buffer, n_bytes, 0, n_bytes, ctx.stream()));
848
  aclTensor* zero =
849
  ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
850
  return zero;
@@ -877,7 +833,7 @@ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
877
  float alpha_host = 1.0f;
878
  aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT);
879
  aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
880
- GGML_CANN_CALL_ACLNN_OP(InplaceAdds, acl_tensor, other, alpha);
881
  return acl_tensor;
882
  }
883
 
@@ -903,11 +859,8 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
903
  aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes,
904
  src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
905
  ggml_element_size(src));
906
- GGML_CANN_CALL_ACLNN_OP(RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd);
907
- ACL_CHECK(aclDestroyTensor(acl_src));
908
- ACL_CHECK(aclDestroyTensor(acl_dst));
909
- ACL_CHECK(aclDestroyTensor(acl_gamma));
910
- ACL_CHECK(aclDestroyTensor(acl_rstd));
911
  }
912
 
913
  // TODO: performance is low.
@@ -933,13 +886,10 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
933
  float alphaValue = 1.0f;
934
  alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
935
 
936
- GGML_CANN_CALL_ACLNN_OP(InplaceTriu, mask_tensor, n_past + 1);
937
- GGML_CANN_CALL_ACLNN_OP(Tril, acl_src, n_past + 1, acl_dst);
938
- GGML_CANN_CALL_ACLNN_OP(InplaceAdd, acl_dst, mask_tensor, alpha);
939
- ACL_CHECK(aclDestroyScalar(alpha));
940
- ACL_CHECK(aclDestroyTensor(mask_tensor));
941
- ACL_CHECK(aclDestroyTensor(acl_src));
942
- ACL_CHECK(aclDestroyTensor(acl_dst));
943
  }
944
 
945
  /**
@@ -960,7 +910,8 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
960
  static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
961
  aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) {
962
  aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims);
963
- GGML_CANN_CALL_ACLNN_OP(Permute, acl_src, acl_dims, acl_dst);
 
964
  }
965
 
966
  static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
@@ -981,8 +932,7 @@ static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
981
  aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
982
  }
983
 
984
- // release
985
- ACL_CHECK(aclDestroyTensor(acl_dst));
986
  }
987
 
988
  static void ggml_cann_im2col_1d_post_process(
@@ -1004,7 +954,6 @@ static void ggml_cann_im2col_1d_post_process(
1004
 
1005
  // Permute: [N, IC * KH * KW, OW * OH] ->
1006
  // [N, OW * OH * n_bytes_factor, IC * KH * KW]
1007
- aclTensor* tmp_permute_tensor = nullptr;
1008
  ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
1009
  tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
1010
  void* tmp_permute_buffer = tmp_permute_allocator.get();
@@ -1016,7 +965,7 @@ static void ggml_cann_im2col_1d_post_process(
1016
  tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
1017
  }
1018
 
1019
- tmp_permute_tensor = ggml_cann_create_tensor(
1020
  tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
1021
  ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
1022
  GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
@@ -1046,9 +995,8 @@ static void ggml_cann_im2col_1d_post_process(
1046
  c * KH * KW * n_step_w * ggml_type_size(dst->type);
1047
 
1048
  for (int i = 0; i < n_step_w; i++) {
1049
- ACL_CHECK(aclrtMemcpyAsync(
1050
- cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy,
1051
- ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
1052
  cur_dst_buffer =
1053
  (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
1054
  cur_permute_buffer = (char*)cur_permute_buffer +
@@ -1058,13 +1006,11 @@ static void ggml_cann_im2col_1d_post_process(
1058
  } else {
1059
  offset = KH * KW * n_step_w *
1060
  ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
1061
- ACL_CHECK(aclrtMemcpyAsync(dst->data, offset,
1062
- (char*)tmp_permute_buffer + offset, offset,
1063
- ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
1064
  }
1065
 
1066
- // release
1067
- ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
1068
  }
1069
 
1070
  void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -1126,7 +1072,7 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1126
  auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
1127
  auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
1128
  auto* strides = aclCreateIntArray(stride_dims.data(), 2);
1129
- GGML_CANN_CALL_ACLNN_OP(Im2col, acl_src1, kernel_size, dilations,
1130
  paddings, strides, tmp_im2col_tensor);
1131
 
1132
  // Cast if dst is f16.
@@ -1160,14 +1106,8 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1160
  tmp_im2col_tensor, im2col_op_params);
1161
  }
1162
 
1163
- // release
1164
- ACL_CHECK(aclDestroyTensor(acl_src1));
1165
- ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
1166
- ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
1167
- ACL_CHECK(aclDestroyIntArray(kernel_size));
1168
- ACL_CHECK(aclDestroyIntArray(dilations));
1169
- ACL_CHECK(aclDestroyIntArray(paddings));
1170
- ACL_CHECK(aclDestroyIntArray(strides));
1171
  }
1172
 
1173
  /**
@@ -1184,17 +1124,17 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1184
  * @param acl_src The tensor on which the exponential function will be applied.
1185
  */
1186
  static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) {
1187
- GGML_CANN_CALL_ACLNN_OP(InplaceExp, acl_src);
1188
  }
1189
 
1190
  void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1191
  aclTensor* acl_dst) {
1192
- GGML_CANN_CALL_ACLNN_OP(Cos, acl_src, acl_dst);
1193
  }
1194
 
1195
  void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1196
  aclTensor* acl_dst) {
1197
- GGML_CANN_CALL_ACLNN_OP(Sin, acl_src, acl_dst);
1198
  }
1199
 
1200
  void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
@@ -1243,13 +1183,13 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
1243
 
1244
  ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src));
1245
  void* tmp_permute_buffer = permute_allocator.get();
1246
- aclTensor* tmp_permute_tenosr = ggml_cann_create_tensor(
1247
  tmp_permute_buffer, ggml_cann_type_mapping(src->type),
1248
  ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb,
1249
  GGML_MAX_DIMS, ACL_FORMAT_ND);
1250
  int64_t permute_dim[] = {0, 1, 3, 2};
1251
  int64_t num_dims = 4;
1252
- aclnn_permute(ctx, acl_src, tmp_permute_tenosr, permute_dim, num_dims);
1253
 
1254
  // timestep * freq
1255
  int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2],
@@ -1270,7 +1210,7 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
1270
  tmp_mul_buffer, ggml_cann_type_mapping(src->type),
1271
  ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1272
  ACL_FORMAT_ND);
1273
- aclnn_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, tmp_mul_tensor);
1274
 
1275
  // cos
1276
  ggml_cann_pool_alloc cos_allocator(
@@ -1298,17 +1238,13 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
1298
  int64_t concat_dim = 3;
1299
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1300
  aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor};
1301
- aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
1302
- aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
1303
 
1304
  // release
1305
  // segmentation fault when delete both tensorList and his elements.
1306
- ACL_CHECK(aclDestroyTensorList(tensorList));
1307
- ACL_CHECK(aclDestroyTensor(acl_src));
1308
- ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
1309
- ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr));
1310
- ACL_CHECK(aclDestroyTensor(tmp_mul_tensor));
1311
- ACL_CHECK(aclDestroyTensor(acl_dst));
1312
  }
1313
 
1314
  /**
@@ -1324,8 +1260,8 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
1324
  static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
1325
  aclTensor* acl_dst) {
1326
  auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
1327
- GGML_CANN_CALL_ACLNN_OP(InplaceFillScalar, acl_dst, acl_scalar);
1328
- ACL_CHECK(aclDestroyScalar(acl_scalar));
1329
  }
1330
 
1331
  /**
@@ -1346,7 +1282,7 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
1346
  */
1347
  static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
1348
  aclTensor* acl_dst, aclTensor* acl_exp) {
1349
- GGML_CANN_CALL_ACLNN_OP(InplacePowTensorTensor, acl_dst, acl_exp);
1350
  }
1351
 
1352
  /**
@@ -1498,15 +1434,9 @@ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1498
 
1499
  // add
1500
  aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
1501
-
1502
- ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor));
1503
- ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor));
1504
- ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor));
1505
- ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor));
1506
- ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor));
1507
- ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
1508
- ACL_CHECK(aclDestroyTensor(tmp_mk_tensor));
1509
- ACL_CHECK(aclDestroyTensor(tmp_output_tensor));
1510
  }
1511
 
1512
  void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -1529,7 +1459,7 @@ void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1529
  */
1530
  static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1531
  int64_t dim, aclTensor* acl_dst) {
1532
- GGML_CANN_CALL_ACLNN_OP(Softmax, acl_src, dim, acl_dst);
1533
  }
1534
 
1535
  void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -1579,8 +1509,7 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1579
  src1_fp32_nb, GGML_MAX_DIMS);
1580
  aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
1581
  aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
1582
-
1583
- ACL_CHECK(aclDestroyTensor(acl_src1));
1584
  } else {
1585
  acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
1586
  }
@@ -1633,17 +1562,13 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1633
 
1634
  // softmax
1635
  aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
1636
- ACL_CHECK(aclDestroyTensor(alibi_output_tensor));
1637
  } else {
1638
  aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
1639
  }
1640
 
1641
- ACL_CHECK(aclDestroyTensor(acl_src0));
1642
- ACL_CHECK(aclDestroyTensor(acl_src1_fp32_tensor));
1643
- ACL_CHECK(aclDestroyTensor(acl_dst));
1644
- ACL_CHECK(aclDestroyScalar(acl_scale));
1645
- ACL_CHECK(aclDestroyTensor(acl_input_mul_scale_tensor));
1646
- ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
1647
  }
1648
 
1649
  /**
@@ -1690,10 +1615,8 @@ static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
1690
  (char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
1691
  ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
1692
  acl_out_ne, acl_out_nb, 2);
1693
- GGML_CANN_CALL_ACLNN_OP(Embedding, acl_src_tensor, acl_index, acl_out);
1694
- ACL_CHECK(aclDestroyTensor(acl_src_tensor));
1695
- ACL_CHECK(aclDestroyTensor(acl_index));
1696
- ACL_CHECK(aclDestroyTensor(acl_out));
1697
  }
1698
  }
1699
  }
@@ -1724,8 +1647,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1724
  aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
1725
  aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
1726
  src_trans_nb, src1, dst);
1727
- ACL_CHECK(aclDestroyTensor(acl_src0));
1728
- ACL_CHECK(aclDestroyTensor(src_trans_tensor));
1729
  break;
1730
  }
1731
  case GGML_TYPE_Q8_0: {
@@ -1787,7 +1709,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1787
  aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
1788
  dequant_ne, dequant_nb, src1, dst);
1789
 
1790
- ACL_CHECK(aclDestroyTensor(dequant_tensor));
1791
  break;
1792
  }
1793
  default:
@@ -1815,7 +1737,7 @@ static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx,
1815
  aclTensor* acl_src, aclTensor* acl_dst,
1816
  int64_t dim, int64_t repeats,
1817
  int64_t output_size) {
1818
- GGML_CANN_CALL_ACLNN_OP(RepeatInterleaveIntWithDim, acl_src, repeats, dim,
1819
  output_size, acl_dst);
1820
  }
1821
 
@@ -1864,21 +1786,19 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
1864
 
1865
  switch (n_dims) {
1866
  case 2:
1867
- GGML_CANN_CALL_ACLNN_OP(Mm, acl_input_tensor, acl_weight_tensor, acl_dst, 2);
1868
  break;
1869
  case 3:
1870
- GGML_CANN_CALL_ACLNN_OP(BatchMatMul, acl_input_tensor, acl_weight_tensor, acl_dst, 2);
1871
  break;
1872
  default:
1873
  // ALLOW_FP32_DOWN_PRECISION, when input is
1874
  // fp32, atlas a2 will transpose it to HFLOAT32.
1875
- GGML_CANN_CALL_ACLNN_OP(Matmul, acl_input_tensor, acl_weight_tensor, acl_dst, 1);
1876
  break;
1877
  }
1878
 
1879
- ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
1880
- ACL_CHECK(aclDestroyTensor(acl_input_tensor));
1881
- ACL_CHECK(aclDestroyTensor(acl_dst));
1882
  }
1883
 
1884
  /**
@@ -1948,9 +1868,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
1948
  input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
1949
  input_cast_nb, GGML_MAX_DIMS);
1950
  aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
1951
-
1952
- ACL_CHECK(aclDestroyTensor(acl_input_tensor));
1953
- ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
1954
  }
1955
 
1956
  // output
@@ -2003,13 +1921,11 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
2003
  if (src0->ne[0] > QK8_0) {
2004
  antiquantGroupSize = QK8_0;
2005
  }
2006
- GGML_CANN_CALL_ACLNN_OP(WeightQuantBatchMatmulV2, acl_input_tensor,
2007
  acl_weight_tensor, acl_scale_tensor, nullptr,
2008
  nullptr, nullptr, nullptr, antiquantGroupSize,
2009
  acl_output_tensor);
2010
- ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
2011
- ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
2012
- ACL_CHECK(aclDestroyTensor(acl_output_tensor));
2013
 
2014
  // other splits
2015
  for (int64_t split = 1; split < split_size; split++) {
@@ -2036,16 +1952,14 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
2036
  (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
2037
  output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
2038
  output_ne_offset);
2039
- GGML_CANN_CALL_ACLNN_OP(WeightQuantBatchMatmulV2, acl_input_tensor,
2040
  acl_weight_tensor, acl_scale_tensor, nullptr,
2041
  nullptr, nullptr, nullptr, antiquantGroupSize,
2042
  acl_output_tensor);
2043
- ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
2044
- ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
2045
- ACL_CHECK(aclDestroyTensor(acl_output_tensor));
2046
  }
2047
 
2048
- ACL_CHECK(aclDestroyTensor(acl_input_tensor));
2049
  }
2050
  }
2051
 
@@ -2064,8 +1978,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
2064
  aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
2065
  aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
2066
 
2067
- ACL_CHECK(aclDestroyTensor(acl_output_tensor));
2068
- ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
2069
  }
2070
  }
2071
 
@@ -2106,9 +2019,8 @@ static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src,
2106
  aclTensor* acl_dst, int64_t* shifts, int64_t* dims) {
2107
  aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1);
2108
  aclIntArray* acl_dims = aclCreateIntArray(dims, 1);
2109
- GGML_CANN_CALL_ACLNN_OP(Roll, acl_src, acl_shifts, acl_dims, acl_dst);
2110
- ACL_CHECK(aclDestroyIntArray(acl_shifts));
2111
- ACL_CHECK(aclDestroyIntArray(acl_dims));
2112
  }
2113
 
2114
  /**
@@ -2130,9 +2042,8 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
2130
  float value) {
2131
  aclIntArray* acl_index = aclCreateIntArray(index, index_num);
2132
  aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
2133
- GGML_CANN_CALL_ACLNN_OP(InplaceIndexFillTensor, acl_src, dim, acl_index, acl_value);
2134
- ACL_CHECK(aclDestroyIntArray(acl_index));
2135
- ACL_CHECK(aclDestroyScalar(acl_value));
2136
  }
2137
 
2138
  static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
@@ -2169,7 +2080,8 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
2169
 
2170
  // power
2171
  aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
2172
- GGML_CANN_CALL_ACLNN_OP(PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor, acl_theta_scale_tensor);
 
2173
 
2174
  // freq_scale
2175
  if (freq_scale != 1) {
@@ -2182,7 +2094,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
2182
  src2->data, ggml_cann_type_mapping(src2->type),
2183
  ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2184
  aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor);
2185
- ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor));
2186
  }
2187
 
2188
  // position
@@ -2251,12 +2163,8 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
2251
  }
2252
 
2253
  // release
2254
- ACL_CHECK(aclDestroyTensor(acl_theta_scale_tensor));
2255
- ACL_CHECK(aclDestroyTensor(acl_position_tensor));
2256
- ACL_CHECK(aclDestroyTensor(acl_theta_tensor));
2257
- ACL_CHECK(aclDestroyTensor(acl_sin_tensor));
2258
- ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
2259
- ACL_CHECK(aclDestroyScalar(acl_theta_scale));
2260
  }
2261
 
2262
  #ifdef __cplusplus
@@ -2368,8 +2276,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2368
  int64_t shifts[] = {1};
2369
  int64_t dims[] = {3};
2370
  aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
2371
- ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
2372
- ACL_CHECK(aclDestroyTensor(acl_input_tensor));
2373
 
2374
  // init [-1, 1, -1, 1, ...]
2375
  minus_one_scale_buffer = minus_one_scale_allocator.get();
@@ -2405,8 +2312,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2405
  int64_t dims[] = {3};
2406
  aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
2407
 
2408
- ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
2409
- ACL_CHECK(aclDestroyTensor(acl_input_tensor));
2410
  // init [-1, -1, -1, 1, 1,1,...]
2411
  minus_one_scale_buffer = minus_one_scale_allocator.get();
2412
  int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
@@ -2431,7 +2337,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2431
  bool inplace = true;
2432
  float scale = -1;
2433
  aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
2434
- ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
2435
  }
2436
 
2437
  // TODO: n_dims < ne0
@@ -2496,14 +2402,10 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2496
  output_fp32_tensor);
2497
  aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
2498
 
2499
- ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
2500
- ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
2501
- ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
2502
- ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
2503
- ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
2504
- ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
2505
- ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
2506
- ACL_CHECK(aclDestroyTensor(acl_src));
2507
  }
2508
  return;
2509
  #endif
@@ -2513,8 +2415,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2513
 
2514
  switch (src0->type) {
2515
  case GGML_TYPE_F32: {
2516
- GGML_CANN_CALL_ACLNN_OP(RotaryPositionEmbedding, acl_src, acl_cos_reshape_tensor,
2517
- acl_sin_reshape_tensor, acl_mode, acl_dst);
2518
  break;
2519
  }
2520
  case GGML_TYPE_F16: {
@@ -2540,23 +2442,22 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2540
 
2541
  aclnn_cast(ctx, acl_src, acl_src_trans_tensor, ACL_FLOAT);
2542
 
2543
- GGML_CANN_CALL_ACLNN_OP(RotaryPositionEmbedding, acl_src_trans_tensor, acl_cos_reshape_tensor,
2544
- acl_sin_reshape_tensor, acl_mode, acl_dst_trans_tensor);
 
2545
 
2546
  aclnn_cast(ctx, acl_dst_trans_tensor, acl_dst, ACL_FLOAT16);
2547
 
2548
- ACL_CHECK(aclDestroyTensor(acl_src_trans_tensor));
2549
- ACL_CHECK(aclDestroyTensor(acl_dst_trans_tensor));
2550
  break;
2551
  }
2552
  default:
2553
  GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE");
2554
  break;
2555
  }
2556
- ACL_CHECK(aclDestroyTensor(acl_src));
2557
- ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
2558
- ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
2559
- ACL_CHECK(aclDestroyTensor(acl_dst));
2560
  }
2561
 
2562
 
@@ -2566,10 +2467,9 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2566
  aclTensor* acl_src = ggml_cann_create_tensor(src0);
2567
  aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
2568
 
2569
- GGML_CANN_CALL_ACLNN_OP(ArgMax, acl_src, 3, false, acl_dst);
2570
 
2571
- ACL_CHECK(aclDestroyTensor(acl_src));
2572
- ACL_CHECK(aclDestroyTensor(acl_dst));
2573
  }
2574
 
2575
  void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
@@ -2598,14 +2498,10 @@ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* ds
2598
  cubeMathType = 1;
2599
  #endif
2600
 
2601
- GGML_CANN_CALL_ACLNN_OP(Convolution, acl_input, acl_weight, nullptr, stride,
2602
  padding, dilation, transposed, padding, groups, acl_dst, cubeMathType);
2603
 
2604
- ACL_CHECK(aclDestroyTensor(acl_weight));
2605
- ACL_CHECK(aclDestroyTensor(acl_dst));
2606
- ACL_CHECK(aclDestroyIntArray(stride));
2607
- ACL_CHECK(aclDestroyIntArray(padding));
2608
- ACL_CHECK(aclDestroyIntArray(dilation));
2609
  }
2610
 
2611
  void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){
@@ -2618,12 +2514,10 @@ void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2618
  aclScalar* alpha = nullptr;
2619
  alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
2620
 
2621
- GGML_CANN_CALL_ACLNN_OP(Elu, acl_input, alpha, alpha, alpha,
2622
  acl_dst);
2623
 
2624
- ACL_CHECK(aclDestroyTensor(acl_input));
2625
- ACL_CHECK(aclDestroyTensor(acl_dst));
2626
- ACL_CHECK(aclDestroyScalar(alpha));
2627
  }
2628
 
2629
  void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst){
@@ -2636,11 +2530,9 @@ void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2636
  aclIntArray* reduceDim = aclCreateIntArray(reduceDimValue, 1);
2637
  bool keepDim = true;
2638
 
2639
- GGML_CANN_CALL_ACLNN_OP(Mean, acl_src, reduceDim, keepDim, ACL_FLOAT, acl_dst);
2640
 
2641
- ACL_CHECK(aclDestroyTensor(acl_src));
2642
- ACL_CHECK(aclDestroyTensor(acl_dst));
2643
- ACL_CHECK(aclDestroyIntArray(reduceDim));
2644
  }
2645
 
2646
  void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
@@ -2660,12 +2552,11 @@ void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2660
  ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
2661
  dst->ne, dst->nb, 3);
2662
 
2663
- GGML_CANN_CALL_ACLNN_OP(ReflectionPad1d, acl_src, paddings, acl_dst);
2664
 
2665
- ACL_CHECK(aclDestroyTensor(acl_src));
2666
- ACL_CHECK(aclDestroyTensor(acl_dst));
2667
  }
2668
- ACL_CHECK(aclDestroyIntArray(paddings));
2669
  }
2670
 
2671
  void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst){
@@ -2675,12 +2566,11 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2675
  aclTensor* acl_self = ggml_cann_create_tensor(src0);
2676
  aclTensor* acl_other = ggml_cann_create_tensor(src1);
2677
 
2678
- GGML_CANN_CALL_ACLNN_OP(InplaceEqTensor, acl_self, acl_other);
2679
 
2680
  ggml_cann_sum(ctx, dst);
2681
 
2682
- ACL_CHECK(aclDestroyTensor(acl_self));
2683
- ACL_CHECK(aclDestroyTensor(acl_other));
2684
  }
2685
 
2686
  void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
@@ -2693,9 +2583,7 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2693
  aclScalar* alpha = nullptr;
2694
  alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
2695
 
2696
- GGML_CANN_CALL_ACLNN_OP(GtScalar, acl_src, alpha, acl_dst);
2697
 
2698
- ACL_CHECK(aclDestroyTensor(acl_src));
2699
- ACL_CHECK(aclDestroyTensor(acl_dst));
2700
- ACL_CHECK(aclDestroyScalar(alpha));
2701
  }
 
103
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
104
 
105
  unary_op(ctx, acl_src, acl_dst);
106
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
 
 
107
  }
108
 
109
  /**
 
121
  // repeat tensor along each dim with repeat_array
122
  aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS);
123
 
124
+ GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_src, repeats, acl_dst);
125
+ ggml_cann_release_resources(ctx, repeats);
126
  }
127
 
128
  /**
 
140
  */
141
  static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
142
  aclTensor* acl_dst, aclDataType cast_data_type) {
143
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src, cast_data_type, acl_dst);
144
  }
145
 
146
  void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
154
  dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]};
155
 
156
  aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray);
157
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
 
158
  }
159
 
160
  void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
 
162
  float alphaValue = 1.0f;
163
  aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
164
  if (acl_dst != nullptr)
165
+ GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha, acl_dst);
166
  else
167
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_src0, acl_src1, alpha);
168
+ ggml_cann_release_resources(ctx, alpha);
169
  }
170
 
171
  void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
 
173
  float alphaValue = 1.0f;
174
  aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
175
  if (acl_dst != nullptr)
176
+ GGML_CANN_CALL_ACLNN_OP(ctx, Sub, acl_src0, acl_src1, alpha, acl_dst);
177
  else
178
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSub, acl_src0, acl_src1, alpha);
179
+ ggml_cann_release_resources(ctx, alpha);
180
  }
181
 
182
  void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
183
  aclTensor* acl_other, aclTensor* acl_dst) {
184
  if (acl_dst != nullptr)
185
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_src, acl_other, acl_dst);
186
  else
187
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_src, acl_other);
188
  }
189
 
190
  void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
191
  aclTensor* acl_other, aclTensor* acl_dst) {
192
  if (acl_dst != nullptr)
193
+ GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src, acl_other, acl_dst);
194
  else
195
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDiv, acl_src, acl_other);
196
  }
197
 
198
  /**
 
221
  float scale, aclTensor* acl_dst, bool inplace) {
222
  aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
223
  if (inplace) {
224
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_src, acl_scale);
225
  } else {
226
+ GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, acl_scale, acl_dst);
227
  }
228
+ ggml_cann_release_resources(ctx, acl_scale);
229
  }
230
 
231
  void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
242
  aclScalar* acl_negative_slope =
243
  aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);
244
 
245
+ GGML_CANN_CALL_ACLNN_OP(ctx, LeakyRelu, acl_src, acl_negative_slope, acl_dst);
246
+ ggml_cann_release_resources(ctx, acl_negative_slope, acl_src, acl_dst);
 
 
 
247
  }
248
 
249
  /**
 
259
  static void aclnn_concat(ggml_backend_cann_context& ctx,
260
  aclTensorList* tensorList, aclTensor* acl_dst,
261
  int64_t concat_dim) {
262
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cat, tensorList, concat_dim, acl_dst);
263
  }
264
 
265
  void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
275
  int32_t acl_dim = 3 - dim;
276
 
277
  aclTensor* tensors[] = {acl_src0, acl_src1};
278
+ aclTensorList* tensor_list = aclCreateTensorList(tensors, 2);
279
+ aclnn_concat(ctx, tensor_list, acl_dst, acl_dim);
280
 
281
+ ggml_cann_release_resources(ctx, tensor_list, acl_dst);
 
282
  }
283
 
284
  /**
 
308
  aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT);
309
  aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT);
310
 
311
+ GGML_CANN_CALL_ACLNN_OP(ctx, Arange, acl_start, acl_end, acl_step, acl_dst);
312
+ ggml_cann_release_resources(ctx, acl_start, acl_end, acl_step);
 
 
313
  }
314
 
315
  void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
326
  memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
327
 
328
  aclnn_arange(ctx, acl_dst, start, stop, step, n_elements);
329
+ ggml_cann_release_resources(ctx, acl_dst);
330
  }
331
 
332
  void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
343
  aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT);
344
  aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT);
345
 
346
+ GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_src, acl_min, acl_max, acl_dst);
347
+ ggml_cann_release_resources(ctx, acl_min, acl_max, acl_src, acl_dst);
 
 
 
348
  }
349
 
350
  void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
358
  aclTensor* acl_src = ggml_cann_create_tensor(src);
359
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
360
 
361
+ GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, scale, acl_dst);
362
+ ggml_cann_release_resources(ctx, scale, acl_src, acl_dst);
 
 
363
  }
364
 
365
  void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
374
  aclTensor* tmp_tensor =
375
  ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type),
376
  dst->ne, dst->nb, GGML_MAX_DIMS);
377
+ GGML_CANN_CALL_ACLNN_OP(ctx, Argsort, acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false),
378
  tmp_tensor);
379
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cast, tmp_tensor, ggml_cann_type_mapping(dst->type), acl_dst);
380
+ ggml_cann_release_resources(ctx, acl_src, tmp_tensor, acl_dst);
 
 
381
  }
382
 
383
  void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
391
 
392
  std::vector<int64_t> normData = {dst->ne[0]};
393
  aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
394
+ GGML_CANN_CALL_ACLNN_OP(ctx, LayerNorm, acl_src, norm, nullptr, nullptr,
395
  eps, acl_dst, nullptr, nullptr);
396
+ ggml_cann_release_resources(ctx, norm, acl_src, acl_dst);
 
 
397
  }
398
 
399
  void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
423
  aclTensor* acl_rstd_out = ggml_cann_create_tensor(
424
  (char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
425
 
426
+ GGML_CANN_CALL_ACLNN_OP(ctx, GroupNorm, acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps,
427
  acl_dst, acl_mean_out, acl_rstd_out);
428
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_mean_out, acl_rstd_out);
 
 
 
429
  }
430
 
431
  void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
450
 
451
  if (!inplace) {
452
  size_t cpy_size = ggml_nbytes(dst);
453
+ ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size,
454
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
455
  aclTensor* acl_src0 = ggml_cann_create_tensor(
456
  src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
457
 
458
+ GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha, acl_dst);
459
+ ggml_cann_release_resources(ctx, acl_src0);
460
  } else {
461
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst, acl_src1, alpha);
462
  }
463
+ ggml_cann_release_resources(ctx, acl_src1, acl_dst);
 
 
464
  }
465
 
466
  /**
 
473
  * @param dim An array of dimension indices.
474
  * @param dim_size The number of dimensions.
475
  */
 
476
  static void aclnn_reduce_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst,
477
  int64_t* dim, size_t dim_size) {
478
  GGML_ASSERT(dst->ne[0] == 1);
 
481
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
482
  aclIntArray* reduce_dims = aclCreateIntArray(dim, dim_size);
483
 
484
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src, reduce_dims, true,
485
  ggml_cann_type_mapping(dst->type), acl_dst);
486
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, reduce_dims);
 
 
487
  }
488
 
489
  void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
507
  std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]};
508
  auto output_size_array = aclCreateIntArray(output_size.data(), 2);
509
 
510
+ GGML_CANN_CALL_ACLNN_OP(ctx, UpsampleNearest2d, acl_src, output_size_array, acl_dst);
511
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, output_size_array);
 
 
512
  }
513
 
514
  /**
 
531
  aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
532
  aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
533
 
534
+ GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_src, acl_pad, acl_value, acl_dst);
535
+ ggml_cann_release_resources(ctx, acl_pad, acl_value);
 
536
  }
537
 
538
  void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
548
  0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
549
  0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
550
  aclnn_pad(ctx, acl_src, acl_dst, paddings);
551
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
 
 
552
  }
553
 
554
  /**
 
598
  cube_math_type = 1;
599
  #endif
600
 
601
+ GGML_CANN_CALL_ACLNN_OP(ctx, AvgPool2d, acl_src, kernel_size, strides, paddings_avg,
602
  ceil_mode, count_include_pad, divisor_override,
603
  cube_math_type, acl_dst);
604
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, kernel_size, strides,
605
+ paddings_avg);
 
 
 
606
  }
607
 
608
  /**
 
670
 
671
  bool ceil_mode = false;
672
  int64_t auto_pads = 0;
673
+ GGML_CANN_CALL_ACLNN_OP(ctx, MaxPool, tmp_tensor, kernel_size, strides, auto_pads,
674
  paddings_max, dilations, ceil_mode, acl_dst);
675
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, tmp_tensor, kernel_size,
676
+ strides, paddings_max, dilations);
 
 
 
 
 
677
  }
678
 
679
  void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
704
  */
705
  static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
706
  aclTensor* acl_dst) {
707
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst, acl_src);
708
  }
709
 
710
  void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
722
  if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
723
  if (dst->type == src0->type) {
724
  size_t cpy_size = ggml_nbytes(dst);
725
+ ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size,
726
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
 
727
  return;
728
  } else {
729
  ggml_cann_pool_alloc src_buffer_allocator(
 
742
 
743
  aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
744
  size_t cpy_size = ggml_nbytes(dst);
745
+ ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
746
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
747
+ ggml_cann_release_resources(ctx, src_trans_tensor);
 
748
  return;
749
  }
750
  } else if (ggml_is_contiguous(dst)) {
 
764
  aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
765
 
766
  size_t cpy_size = ggml_nbytes(dst);
767
+ ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
768
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
769
+ ggml_cann_release_resources(ctx, src_trans_tensor);
 
770
  return;
771
  } else {
772
  GGML_ABORT("Unsupport dst is not tontiguous.");
773
  }
774
  }
775
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
 
 
776
  }
777
 
778
  /**
 
800
  nb[i] = nb[i - 1] * ne[i - 1];
801
  }
802
 
803
+ ggml_cann_async_memset(ctx, buffer, n_bytes, 0);
804
  aclTensor* zero =
805
  ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
806
  return zero;
 
833
  float alpha_host = 1.0f;
834
  aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT);
835
  aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
836
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_tensor, other, alpha);
837
  return acl_tensor;
838
  }
839
 
 
859
  aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes,
860
  src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
861
  ggml_element_size(src));
862
+ GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd);
863
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd);
 
 
 
864
  }
865
 
866
  // TODO: performance is low.
 
886
  float alphaValue = 1.0f;
887
  alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
888
 
889
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceTriu, mask_tensor, n_past + 1);
890
+ GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src, n_past + 1, acl_dst);
891
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst, mask_tensor, alpha);
892
+ ggml_cann_release_resources(ctx, alpha, acl_src, acl_dst, mask_tensor);
 
 
 
893
  }
894
 
895
  /**
 
910
  static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
911
  aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) {
912
  aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims);
913
+ GGML_CANN_CALL_ACLNN_OP(ctx, Permute, acl_src, acl_dims, acl_dst);
914
+ ggml_cann_release_resources(ctx, acl_dims);
915
  }
916
 
917
  static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
 
932
  aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
933
  }
934
 
935
+ ggml_cann_release_resources(ctx, acl_dst);
 
936
  }
937
 
938
  static void ggml_cann_im2col_1d_post_process(
 
954
 
955
  // Permute: [N, IC * KH * KW, OW * OH] ->
956
  // [N, OW * OH * n_bytes_factor, IC * KH * KW]
 
957
  ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
958
  tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
959
  void* tmp_permute_buffer = tmp_permute_allocator.get();
 
965
  tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
966
  }
967
 
968
+ aclTensor* tmp_permute_tensor = ggml_cann_create_tensor(
969
  tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
970
  ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
971
  GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
 
995
  c * KH * KW * n_step_w * ggml_type_size(dst->type);
996
 
997
  for (int i = 0; i < n_step_w; i++) {
998
+ ggml_cann_async_memcpy(ctx, cur_dst_buffer, cur_permute_buffer, size_cpy,
999
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
 
1000
  cur_dst_buffer =
1001
  (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
1002
  cur_permute_buffer = (char*)cur_permute_buffer +
 
1006
  } else {
1007
  offset = KH * KW * n_step_w *
1008
  ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
1009
+ ggml_cann_async_memcpy(ctx, dst->data, (char*)tmp_permute_buffer + offset, offset,
1010
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
 
1011
  }
1012
 
1013
+ ggml_cann_release_resources(ctx, tmp_permute_tensor);
 
1014
  }
1015
 
1016
  void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
1072
  auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
1073
  auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
1074
  auto* strides = aclCreateIntArray(stride_dims.data(), 2);
1075
+ GGML_CANN_CALL_ACLNN_OP(ctx, Im2col, acl_src1, kernel_size, dilations,
1076
  paddings, strides, tmp_im2col_tensor);
1077
 
1078
  // Cast if dst is f16.
 
1106
  tmp_im2col_tensor, im2col_op_params);
1107
  }
1108
 
1109
+ ggml_cann_release_resources(ctx, acl_src1, tmp_im2col_tensor, tmp_cast_tensor,
1110
+ kernel_size, dilations, paddings, strides);
 
 
 
 
 
 
1111
  }
1112
 
1113
  /**
 
1124
  * @param acl_src The tensor on which the exponential function will be applied.
1125
  */
1126
  static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) {
1127
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceExp, acl_src);
1128
  }
1129
 
1130
  void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1131
  aclTensor* acl_dst) {
1132
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst);
1133
  }
1134
 
1135
  void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1136
  aclTensor* acl_dst) {
1137
+ GGML_CANN_CALL_ACLNN_OP(ctx, Sin, acl_src, acl_dst);
1138
  }
1139
 
1140
  void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
 
1183
 
1184
  ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src));
1185
  void* tmp_permute_buffer = permute_allocator.get();
1186
+ aclTensor* tmp_permute_tensor = ggml_cann_create_tensor(
1187
  tmp_permute_buffer, ggml_cann_type_mapping(src->type),
1188
  ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb,
1189
  GGML_MAX_DIMS, ACL_FORMAT_ND);
1190
  int64_t permute_dim[] = {0, 1, 3, 2};
1191
  int64_t num_dims = 4;
1192
+ aclnn_permute(ctx, acl_src, tmp_permute_tensor, permute_dim, num_dims);
1193
 
1194
  // timestep * freq
1195
  int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2],
 
1210
  tmp_mul_buffer, ggml_cann_type_mapping(src->type),
1211
  ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1212
  ACL_FORMAT_ND);
1213
+ aclnn_mul(ctx, tmp_permute_tensor, tmp_arange_tensor, tmp_mul_tensor);
1214
 
1215
  // cos
1216
  ggml_cann_pool_alloc cos_allocator(
 
1238
  int64_t concat_dim = 3;
1239
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1240
  aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor};
1241
+ aclTensorList* tensor_list = aclCreateTensorList(tensors, 2);
1242
+ aclnn_concat(ctx, tensor_list, acl_dst, concat_dim);
1243
 
1244
  // release
1245
  // segmentation fault when delete both tensorList and his elements.
1246
+ ggml_cann_release_resources(ctx, tensor_list, acl_src, tmp_arange_tensor,
1247
+ tmp_permute_tensor, tmp_mul_tensor, acl_dst);
1248
  }
1249
 
1250
  /**
 
1260
  static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
1261
  aclTensor* acl_dst) {
1262
  auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
1263
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
1264
+ ggml_cann_release_resources(ctx, acl_scalar);
1265
  }
1266
 
1267
  /**
 
1282
  */
1283
  static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
1284
  aclTensor* acl_dst, aclTensor* acl_exp) {
1285
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplacePowTensorTensor, acl_dst, acl_exp);
1286
  }
1287
 
1288
  /**
 
1434
 
1435
  // add
1436
  aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
1437
+ ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
1438
+ tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
1439
+ tmp_arange_tensor, tmp_mk_tensor, tmp_output_tensor);
1440
  }
1441
 
1442
  void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
1459
  */
1460
  static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1461
  int64_t dim, aclTensor* acl_dst) {
1462
+ GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
1463
  }
1464
 
1465
  void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
1509
  src1_fp32_nb, GGML_MAX_DIMS);
1510
  aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
1511
  aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
1512
+ ggml_cann_release_resources(ctx, acl_src1);
 
1513
  } else {
1514
  acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
1515
  }
 
1562
 
1563
  // softmax
1564
  aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
1565
+ ggml_cann_release_resources(ctx, alibi_output_tensor);
1566
  } else {
1567
  aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
1568
  }
1569
 
1570
+ ggml_cann_release_resources(ctx, acl_src0, acl_src1_fp32_tensor, acl_dst,
1571
+ acl_scale, acl_input_mul_scale_tensor, tmp_mask_tensor);
1572
  }
1573
 
1574
  /**
 
1615
  (char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
1616
  ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
1617
  acl_out_ne, acl_out_nb, 2);
1618
+ GGML_CANN_CALL_ACLNN_OP(ctx, Embedding, acl_src_tensor, acl_index, acl_out);
1619
+ ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
1620
  }
1621
  }
1622
  }
 
1647
  aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
1648
  aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
1649
  src_trans_nb, src1, dst);
1650
+ ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
 
1651
  break;
1652
  }
1653
  case GGML_TYPE_Q8_0: {
 
1709
  aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
1710
  dequant_ne, dequant_nb, src1, dst);
1711
 
1712
+ ggml_cann_release_resources(ctx, dequant_tensor);
1713
  break;
1714
  }
1715
  default:
 
1737
  aclTensor* acl_src, aclTensor* acl_dst,
1738
  int64_t dim, int64_t repeats,
1739
  int64_t output_size) {
1740
+ GGML_CANN_CALL_ACLNN_OP(ctx, RepeatInterleaveIntWithDim, acl_src, repeats, dim,
1741
  output_size, acl_dst);
1742
  }
1743
 
 
1786
 
1787
  switch (n_dims) {
1788
  case 2:
1789
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mm, acl_input_tensor, acl_weight_tensor, acl_dst, 2);
1790
  break;
1791
  case 3:
1792
+ GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, acl_input_tensor, acl_weight_tensor, acl_dst, 2);
1793
  break;
1794
  default:
1795
  // ALLOW_FP32_DOWN_PRECISION, when input is
1796
  // fp32, atlas a2 will transpose it to HFLOAT32.
1797
+ GGML_CANN_CALL_ACLNN_OP(ctx, Matmul, acl_input_tensor, acl_weight_tensor, acl_dst, 1);
1798
  break;
1799
  }
1800
 
1801
+ ggml_cann_release_resources(ctx, acl_weight_tensor, acl_input_tensor, acl_dst);
1802
  }
1803
 
1804
  /**
 
1868
  input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
1869
  input_cast_nb, GGML_MAX_DIMS);
1870
  aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
1871
+ ggml_cann_release_resources(ctx, acl_input_tensor, acl_src1_tensor);
1872
  }
1873
 
1874
  // output
 
1921
  if (src0->ne[0] > QK8_0) {
1922
  antiquantGroupSize = QK8_0;
1923
  }
1924
+ GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
1925
  acl_weight_tensor, acl_scale_tensor, nullptr,
1926
  nullptr, nullptr, nullptr, antiquantGroupSize,
1927
  acl_output_tensor);
1928
+ ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
1929
 
1930
  // other splits
1931
  for (int64_t split = 1; split < split_size; split++) {
 
1952
  (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
1953
  output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
1954
  output_ne_offset);
1955
+ GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
1956
  acl_weight_tensor, acl_scale_tensor, nullptr,
1957
  nullptr, nullptr, nullptr, antiquantGroupSize,
1958
  acl_output_tensor);
1959
+ ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
1960
  }
1961
 
1962
+ ggml_cann_release_resources(ctx, acl_input_tensor);
1963
  }
1964
  }
1965
 
 
1978
  aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
1979
  aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
1980
 
1981
+ ggml_cann_release_resources(ctx, acl_output_tensor, acl_dst_tensor);
 
1982
  }
1983
  }
1984
 
 
2019
  aclTensor* acl_dst, int64_t* shifts, int64_t* dims) {
2020
  aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1);
2021
  aclIntArray* acl_dims = aclCreateIntArray(dims, 1);
2022
+ GGML_CANN_CALL_ACLNN_OP(ctx, Roll, acl_src, acl_shifts, acl_dims, acl_dst);
2023
+ ggml_cann_release_resources(ctx, acl_shifts, acl_dims);
 
2024
  }
2025
 
2026
  /**
 
2042
  float value) {
2043
  aclIntArray* acl_index = aclCreateIntArray(index, index_num);
2044
  aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
2045
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexFillTensor, acl_src, dim, acl_index, acl_value);
2046
+ ggml_cann_release_resources(ctx, acl_index, acl_value);
 
2047
  }
2048
 
2049
  static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
 
2080
 
2081
  // power
2082
  aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
2083
+ GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor,
2084
+ acl_theta_scale_tensor);
2085
 
2086
  // freq_scale
2087
  if (freq_scale != 1) {
 
2094
  src2->data, ggml_cann_type_mapping(src2->type),
2095
  ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2096
  aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor);
2097
+ ggml_cann_release_resources(ctx, acl_freq_factors_tensor);
2098
  }
2099
 
2100
  // position
 
2163
  }
2164
 
2165
  // release
2166
+ ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor,
2167
+ acl_theta_tensor, acl_sin_tensor, acl_cos_tensor, acl_theta_scale);
2168
  }
2169
 
2170
  #ifdef __cplusplus
 
2276
  int64_t shifts[] = {1};
2277
  int64_t dims[] = {3};
2278
  aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
2279
+ ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor);
 
2280
 
2281
  // init [-1, 1, -1, 1, ...]
2282
  minus_one_scale_buffer = minus_one_scale_allocator.get();
 
2312
  int64_t dims[] = {3};
2313
  aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
2314
 
2315
+ ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor);
 
2316
  // init [-1, -1, -1, 1, 1,1,...]
2317
  minus_one_scale_buffer = minus_one_scale_allocator.get();
2318
  int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
 
2337
  bool inplace = true;
2338
  float scale = -1;
2339
  aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
2340
+ ggml_cann_release_resources(ctx, acl_first_half_tensor);
2341
  }
2342
 
2343
  // TODO: n_dims < ne0
 
2402
  output_fp32_tensor);
2403
  aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
2404
 
2405
+ ggml_cann_release_resources(ctx, input_fp32_tensor1, input_fp32_tensor2,
2406
+ output_fp32_tensor, acl_sin_reshape_tensor,
2407
+ acl_minus_one_tensor, acl_input_roll_mul_scale_tensor,
2408
+ acl_input_roll_reshape_tensor, acl_src);
2409
  }
2410
  return;
2411
  #endif
 
2415
 
2416
  switch (src0->type) {
2417
  case GGML_TYPE_F32: {
2418
+ GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src,
2419
+ acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst);
2420
  break;
2421
  }
2422
  case GGML_TYPE_F16: {
 
2442
 
2443
  aclnn_cast(ctx, acl_src, acl_src_trans_tensor, ACL_FLOAT);
2444
 
2445
+ GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor,
2446
+ acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
2447
+ acl_dst_trans_tensor);
2448
 
2449
  aclnn_cast(ctx, acl_dst_trans_tensor, acl_dst, ACL_FLOAT16);
2450
 
2451
+ ggml_cann_release_resources(ctx, acl_src_trans_tensor,
2452
+ acl_dst_trans_tensor);
2453
  break;
2454
  }
2455
  default:
2456
  GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE");
2457
  break;
2458
  }
2459
+ ggml_cann_release_resources(ctx, acl_cos_reshape_tensor,
2460
+ acl_sin_reshape_tensor, acl_src, acl_dst);
2461
  }
2462
 
2463
 
 
2467
  aclTensor* acl_src = ggml_cann_create_tensor(src0);
2468
  aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
2469
 
2470
+ GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src, 3, false, acl_dst);
2471
 
2472
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
 
2473
  }
2474
 
2475
  void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
 
2498
  cubeMathType = 1;
2499
  #endif
2500
 
2501
+ GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input, acl_weight, nullptr, stride,
2502
  padding, dilation, transposed, padding, groups, acl_dst, cubeMathType);
2503
 
2504
+ ggml_cann_release_resources(ctx, acl_weight, acl_dst, stride, padding, dilation);
2505
  }
2506
 
2507
  void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){
 
2514
  aclScalar* alpha = nullptr;
2515
  alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
2516
 
2517
+ GGML_CANN_CALL_ACLNN_OP(ctx, Elu, acl_input, alpha, alpha, alpha,
2518
  acl_dst);
2519
 
2520
+ ggml_cann_release_resources(ctx, acl_input, acl_dst, alpha);
2521
  }
2522
 
2523
  void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst){
 
2530
  aclIntArray* reduceDim = aclCreateIntArray(reduceDimValue, 1);
2531
  bool keepDim = true;
2532
 
2533
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mean, acl_src, reduceDim, keepDim, ACL_FLOAT, acl_dst);
2534
 
2535
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, reduceDim);
2536
  }
2537
 
2538
  void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
 
2552
  ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
2553
  dst->ne, dst->nb, 3);
2554
 
2555
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src, paddings, acl_dst);
2556
 
2557
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
 
2558
  }
2559
+ ggml_cann_release_resources(ctx, paddings);
2560
  }
2561
 
2562
  void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst){
 
2566
  aclTensor* acl_self = ggml_cann_create_tensor(src0);
2567
  aclTensor* acl_other = ggml_cann_create_tensor(src1);
2568
 
2569
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceEqTensor, acl_self, acl_other);
2570
 
2571
  ggml_cann_sum(ctx, dst);
2572
 
2573
+ ggml_cann_release_resources(ctx, acl_self, acl_other);
 
2574
  }
2575
 
2576
  void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
 
2583
  aclScalar* alpha = nullptr;
2584
  alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
2585
 
2586
+ GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src, alpha, acl_dst);
2587
 
2588
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
2589
  }
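
For reference, every operator in this file now follows the same pattern: the backend context is passed into GGML_CANN_CALL_ACLNN_OP so the launch can be routed through the task queue, and all ACL handles go to ggml_cann_release_resources instead of individual ACL_CHECK(aclDestroy...()) calls. A minimal sketch of that convention (aclnn_cos_example is an illustrative name, not part of this patch; it assumes the includes already present in aclnn_ops.cpp):

static void aclnn_cos_example(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];
    aclTensor* acl_src = ggml_cann_create_tensor(src);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

    // Either launches aclnnCos directly on ctx.stream() or wraps it in an
    // aclnn_task and submits it to ctx.task_queue, depending on ctx.async_mode.
    GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst);

    // In async mode destruction is deferred to the worker thread, so the
    // handles stay alive until the queued launch has been issued.
    ggml_cann_release_resources(ctx, acl_src, acl_dst);
}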
ggml/src/ggml-cann/aclnn_ops.h CHANGED
@@ -23,6 +23,7 @@
23
  #ifndef CANN_ACLNN_OPS
24
  #define CANN_ACLNN_OPS
25
 
 
26
  #include <aclnnop/aclnn_abs.h>
27
  #include <aclnnop/aclnn_neg.h>
28
  #include <aclnnop/aclnn_exp.h>
@@ -713,6 +714,270 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
713
  */
714
  void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
715
 
716
  /**
717
  * @brief Applies a element-wise operation to two input tensors using the CANN
718
  * backend.
@@ -742,42 +1007,9 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
742
  bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
743
  binary_op(ctx, acl_src0, acl_src1, acl_dst);
744
 
745
- ACL_CHECK(aclDestroyTensor(acl_src0));
746
- ACL_CHECK(aclDestroyTensor(acl_src1));
747
- ACL_CHECK(aclDestroyTensor(acl_dst));
748
  }
749
 
750
- /**
751
- * @brief Launches an asynchronous task using the memory allocator.
752
- *
753
- * This macro submit an asynchronous task on the specified stream.
754
- * The task uses memory allocated by the allocator. It is guaranteed
755
- * that the memory will not be accessed by other tasks until this task
756
- * completes, due to the sequential execution order within the same stream.
757
- *
758
- * @param OP_NAME aclnn operator name.
759
- * @param args Additional arguments required by the task.
760
- *
761
- * @note
762
- * Memory from the allocator will be "freed" immediately and can be
763
- * reallocated to other pointers. However, it won't be accessed by any
764
- * other task before this asynchronous task ends, because all tasks in the
765
- * same stream are executed in queue order.
766
- */
767
- #define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...) \
768
- do { \
769
- uint64_t workspaceSize = 0; \
770
- aclOpExecutor * executor; \
771
- void * workspaceAddr = nullptr; \
772
- \
773
- ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
774
- \
775
- if (workspaceSize > 0) { \
776
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); \
777
- workspaceAddr = workspace_allocator.get(); \
778
- } \
779
- ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream())); \
780
- } while (0)
781
 
782
  /**
783
  * @brief Applies a unary operation to an input tensor using the CANN backend.
@@ -799,9 +1031,7 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
799
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
800
 
801
  unary_op(ctx, acl_src, acl_dst);
802
-
803
- ACL_CHECK(aclDestroyTensor(acl_src));
804
- ACL_CHECK(aclDestroyTensor(acl_dst));
805
  }
806
 
807
  /**
@@ -832,7 +1062,7 @@ void ggml_cann_unary_op(
832
  *
833
  * Internally, the lambda will call:
834
  * @code
835
- * GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);
836
  * @endcode
837
  *
838
  * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
@@ -840,14 +1070,14 @@ void ggml_cann_unary_op(
840
  * @see ggml_cann_unary_op
841
  * @see GGML_CANN_CALL_ACLNN_OP
842
  */
843
- #define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
844
- do { \
845
- auto lambda = [](ggml_backend_cann_context& ctx, \
846
- aclTensor* acl_src, \
847
- aclTensor* acl_dst) { \
848
- GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst); \
849
- }; \
850
- ggml_cann_unary_op(lambda, ctx, dst); \
851
- } \
852
  while (0)
853
  #endif // CANN_ACLNN_OPS
 
23
  #ifndef CANN_ACLNN_OPS
24
  #define CANN_ACLNN_OPS
25
 
26
+ #include <functional>
27
  #include <aclnnop/aclnn_abs.h>
28
  #include <aclnnop/aclnn_neg.h>
29
  #include <aclnnop/aclnn_exp.h>
 
714
  */
715
  void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
716
 
717
+ /*
718
+ * @brief A generic wrapper for ACL resources with custom deleter support.
719
+ */
720
+ using any_acl_resource = std::unique_ptr<void, std::function<void(void*)>>;
721
+
722
+ /**
723
+ * @brief Trait structure used to define how to destroy a given ACL resource type.
724
+ *
725
+ * @tparam T ACL resource type.
726
+ */
727
+ template<typename T>
728
+ struct acl_resource_traits;
729
+
730
+ /**
731
+ * @brief Specialization for aclTensor, defines how to destroy an aclTensor resource.
732
+ */
733
+ template<>
734
+ struct acl_resource_traits<aclTensor> {
735
+ static void destroy(void* p) {
736
+ ACL_CHECK(aclDestroyTensor(static_cast<aclTensor*>(p)));
737
+ }
738
+ };
739
+
740
+ /**
741
+ * @brief Specialization for aclIntArray, defines how to destroy an aclIntArray resource.
742
+ */
743
+ template<>
744
+ struct acl_resource_traits<aclIntArray> {
745
+ static void destroy(void* p) {
746
+ ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray*>(p)));
747
+ }
748
+ };
749
+
750
+ /**
751
+ * @brief Specialization for aclScalar, defines how to destroy an aclScalar resource.
752
+ */
753
+ template<>
754
+ struct acl_resource_traits<aclScalar> {
755
+ static void destroy(void* p) {
756
+ ACL_CHECK(aclDestroyScalar(static_cast<aclScalar*>(p)));
757
+ }
758
+ };
759
+
760
+ /**
761
+ * @brief Specialization for aclTensorList, defines how to destroy an aclTensorList resource.
762
+ */
763
+ template<>
764
+ struct acl_resource_traits<aclTensorList> {
765
+ static void destroy(void* p) {
766
+ ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList*>(p)));
767
+ }
768
+ };
769
+
770
+ /**
771
+ * @brief Creates a generic ACL resource wrapper with proper destruction logic.
772
+ *
773
+ * @tparam T ACL resource type.
774
+ * @param ptr Raw pointer to ACL resource.
775
+ * @return any_acl_resource Smart pointer that handles destruction.
776
+ */
777
+ template<typename T>
778
+ any_acl_resource make_acl_resource(T* ptr) {
779
+ return any_acl_resource(
780
+ static_cast<void*>(ptr),
781
+ [](void* p) {
782
+ acl_resource_traits<T>::destroy(p);
783
+ }
784
+ );
785
+ }
786
+
787
+ /**
788
+ * @brief Registers multiple ACL resources into a vector for lifetime management.
789
+ *
790
+ * @tparam Args Variadic list of ACL resource types.
791
+ * @param vec Target vector to hold ACL resources.
792
+ * @param args Raw pointers to ACL resources.
793
+ */
794
+ template<typename... Args>
795
+ void register_acl_resources(std::vector<any_acl_resource>& vec, Args*... args) {
796
+ (vec.emplace_back(make_acl_resource(args)), ...);
797
+ }
798
+
799
+ /**
800
+ * @brief Task class that wraps the execution of an aclnn function call.
801
+ */
802
+ class aclnn_task : public cann_task {
803
+ public:
804
+ aclnn_task(aclnn_func_t aclnn_func, void * workspace_addr,
805
+ uint64_t workspace_size, aclOpExecutor * executor,
806
+ aclrtStream stream) :
807
+ aclnn_func_(aclnn_func),
808
+ workspace_addr_(workspace_addr),
809
+ workspace_size_(workspace_size),
810
+ executor_(executor),
811
+ stream_(stream) {}
812
+ virtual void run_task() override {
813
+ ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_));
814
+ }
815
+ private:
816
+ aclnn_func_t aclnn_func_;
817
+ void * workspace_addr_;
818
+ uint64_t workspace_size_;
819
+ aclOpExecutor * executor_;
820
+ aclrtStream stream_;
821
+ };
822
+
823
+ /**
824
+ * @brief Task class that releases ACL resources after usage.
825
+ */
826
+ class release_resource_task : public cann_task {
827
+ public:
828
+ release_resource_task(std::vector<any_acl_resource>&& resources){
829
+ resource_ = std::move(resources);
830
+ }
831
+
832
+ virtual void run_task() override {
833
+ resource_.clear();
834
+ }
835
+ private:
836
+ std::vector<any_acl_resource> resource_;
837
+ };
838
+
839
+ /**
840
+ * @brief Task class for performing asynchronous memory copy operations.
841
+ */
842
+ class async_memcpy_task : public cann_task {
843
+ public:
844
+ async_memcpy_task(void* dst, const void* src, size_t size,
845
+ aclrtMemcpyKind kind, aclrtStream stream)
846
+ : dst_(dst), src_(src), size_(size), kind_(kind), stream_(stream) {}
847
+
848
+ virtual void run_task() override {
849
+ ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_));
850
+ }
851
+ private:
852
+ void* dst_;
853
+ const void* src_;
854
+ size_t size_;
855
+ aclrtMemcpyKind kind_;
856
+ aclrtStream stream_;
857
+ };
858
+
859
+ /**
860
+ * @brief Task class for performing asynchronous memory set operations.
861
+ */
862
+ class async_memset_task : public cann_task {
863
+ public:
864
+ async_memset_task(void* buffer, size_t size, int32_t value, aclrtStream stream)
865
+ : buffer_(buffer), size_(size), value_(value), stream_(stream) {}
866
+
867
+ virtual void run_task() override {
868
+ ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_));
869
+ }
870
+ private:
871
+ void* buffer_;
872
+ size_t size_;
873
+ int32_t value_;
874
+ aclrtStream stream_;
875
+ };
876
+
877
+ /**
878
+ * @brief Launches an asynchronous task using the memory allocator.
879
+ *
880
+ * This macro submits an asynchronous task on the specified stream.
881
+ * The task uses memory allocated by the allocator. It is guaranteed
882
+ * that the memory will not be accessed by other tasks until this task
883
+ * completes, due to the sequential execution order within the same stream.
884
+ *
885
+ * @param OP_NAME aclnn operator name.
886
+ * @param args Additional arguments required by the task.
887
+ *
888
+ * @note
889
+ * Memory from the allocator will be "freed" immediately and can be
890
+ * reallocated to other pointers. However, it won't be accessed by any
891
+ * other task before this asynchronous task ends, because all tasks in the
892
+ * same stream are executed in queue order.
893
+ */
894
+
895
+ #define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...) \
896
+ do { \
897
+ uint64_t workspaceSize = 0; \
898
+ aclOpExecutor * executor; \
899
+ void * workspaceAddr = nullptr; \
900
+ ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor));\
901
+ /* workspace should be allocated in main thread to keep malloc order when using vmm. */ \
902
+ if (workspaceSize > 0) { \
903
+ ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize); \
904
+ workspaceAddr = workspace_allocator.get(); \
905
+ } \
906
+ if (CTX.async_mode) { \
907
+ auto task = \
908
+ std::make_unique<aclnn_task>(aclnn##OP_NAME, workspaceAddr, workspaceSize, \
909
+ executor, CTX.stream()); \
910
+ CTX.task_queue.submit_task(std::move(task)); \
911
+ } else { \
912
+ ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream()));\
913
+ } \
914
+ } while (0)
915
+
916
+ /**
917
+ * @brief Registers and releases multiple ACL resources, optionally deferring the release
918
+ * using a task.
919
+ *
920
+ * @tparam Args Types of the ACL resources.
921
+ * @param ctx Backend context which manages task submission and async mode.
922
+ * @param args Pointers to ACL resources to be released.
923
+ */
924
+ template <typename... Args>
925
+ void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) {
926
+ std::vector<any_acl_resource> resources;
927
+ register_acl_resources(resources, std::forward<Args>(args)...);
928
+ if(ctx.async_mode) {
929
+ auto task = std::make_unique<release_resource_task>(std::move(resources));
930
+ ctx.task_queue.submit_task(std::move(task));
931
+ }
932
+ }
933
+
934
+ /**
935
+ * @brief Performs an asynchronous memory copy operation, optionally deferred via task submission.
936
+ *
937
+ * @param ctx Backend context containing stream and async configuration.
938
+ * @param dst Destination memory address.
939
+ * @param src Source memory address.
940
+ * @param len Size of memory to copy (in bytes).
941
+ * @param kind Type of memory copy (host-to-device, device-to-host, etc).
942
+ */
943
+ inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst,
944
+ const void * src, size_t len, aclrtMemcpyKind kind) {
945
+ if (ctx.async_mode) {
946
+ auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx.stream());
947
+ ctx.task_queue.submit_task(std::move(task));
948
+ } else {
949
+ ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx.stream()));
950
+ }
951
+ }
952
+
953
+ inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst,
954
+ const void * src, size_t len, aclrtMemcpyKind kind) {
955
+ if (ctx->async_mode) {
956
+ auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx->stream());
957
+ ctx->task_queue.submit_task(std::move(task));
958
+ } else {
959
+ ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx->stream()));
960
+ }
961
+ }
962
+
963
+ /**
964
+ * @brief Performs an asynchronous memory set operation, optionally deferred via task submission.
965
+ *
966
+ * @param ctx Backend context containing stream and async configuration.
967
+ * @param buffer Memory buffer to be set.
968
+ * @param size Size of the memory buffer (in bytes).
969
+ * @param value Value to set in the buffer.
970
+ */
971
+ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer,
972
+ size_t size, int value) {
973
+ if (ctx.async_mode) {
974
+ auto task = std::make_unique<async_memset_task>(buffer, size, value, ctx.stream());
975
+ ctx.task_queue.submit_task(std::move(task));
976
+ } else {
977
+ ACL_CHECK(aclrtMemsetAsync(buffer, size, value, size, ctx.stream()));
978
+ }
979
+ }
980
+
981
  /**
982
  * @brief Applies a element-wise operation to two input tensors using the CANN
983
  * backend.
 
1007
  bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
1008
  binary_op(ctx, acl_src0, acl_src1, acl_dst);
1009
 
1010
+ ggml_cann_release_resources(ctx, acl_src0, acl_src1, acl_dst);
1011
  }
1012
 
1013
 
1014
  /**
1015
  * @brief Applies a unary operation to an input tensor using the CANN backend.
 
1031
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1032
 
1033
  unary_op(ctx, acl_src, acl_dst);
1034
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
1035
  }
1036
 
1037
  /**
 
1062
  *
1063
  * Internally, the lambda will call:
1064
  * @code
1065
+ * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
1066
  * @endcode
1067
  *
1068
  * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
 
1070
  * @see ggml_cann_unary_op
1071
  * @see GGML_CANN_CALL_ACLNN_OP
1072
  */
1073
+ #define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
1074
+ do { \
1075
+ auto lambda = [](ggml_backend_cann_context& ctx, \
1076
+ aclTensor* acl_src, \
1077
+ aclTensor* acl_dst) { \
1078
+ GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
1079
+ }; \
1080
+ ggml_cann_unary_op(lambda, ctx, dst); \
1081
+ } \
1082
  while (0)
1083
  #endif // CANN_ACLNN_OPS
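
The control flow that GGML_CANN_CALL_ACLNN_OP hides is easier to read written out. Roughly what GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst) amounts to, as an illustrative sketch rather than literal preprocessor output (aclnnCosGetWorkspaceSize/aclnnCos follow the standard two-phase aclnn convention the macro relies on):

static void call_aclnn_cos_expanded(ggml_backend_cann_context& ctx,
                                    aclTensor* acl_src, aclTensor* acl_dst) {
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor = nullptr;
    void* workspaceAddr = nullptr;

    // Phase 1 always runs on the calling thread, so pool allocations keep
    // their order when the VMM-backed pool is in use.
    ACL_CHECK(aclnnCosGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
    if (workspaceSize > 0) {
        // The workspace is returned to the pool at the end of this scope, but
        // in-order stream execution guarantees no later task touches it before
        // the launched operator has finished.
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    if (ctx.async_mode) {
        // Phase 2 (the launch) is packaged as an aclnn_task and executed by
        // the per-device worker thread.
        auto task = std::make_unique<aclnn_task>(aclnnCos, workspaceAddr,
                                                 workspaceSize, executor, ctx.stream());
        ctx.task_queue.submit_task(std::move(task));
    } else {
        ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor, ctx.stream()));
    }
}

ggml_cann_release_resources pairs with this: in the synchronous path the wrapped handles are destroyed as soon as its local vector goes out of scope, while in the async path the vector is moved into a release_resource_task, so destruction happens only after every previously queued launch has run.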
ggml/src/ggml-cann/common.h CHANGED
@@ -31,9 +31,16 @@
31
  #include <memory>
32
  #include <string>
33
  #include <vector>
34
 
35
  #include "../include/ggml-cann.h"
36
  #include "../include/ggml.h"
 
37
 
38
  #define MATRIX_ROW_PADDING 512
39
  #define GGML_CANN_MAX_STREAMS 8
@@ -205,6 +212,127 @@ struct ggml_cann_pool_alloc {
205
  ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
206
  };
207
 
208
  /**
209
  * @brief Context for managing CANN backend operations.
210
  */
@@ -213,6 +341,8 @@ struct ggml_backend_cann_context {
213
  std::string name; /**< Name of the device. */
214
  std::string description; /**< Description of the device. */
215
  aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
216
 
217
  aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
218
 
@@ -221,9 +351,12 @@ struct ggml_backend_cann_context {
221
  * @param device Device ID.
222
  */
223
  explicit ggml_backend_cann_context(int device)
224
- : device(device), name("CANN" + std::to_string(device)) {
225
  ggml_cann_set_device(device);
226
  description = aclrtGetSocName();
 
  }
228
 
229
  /**
@@ -231,6 +364,7 @@ struct ggml_backend_cann_context {
231
  */
232
  ~ggml_backend_cann_context() {
233
  ggml_cann_set_device(device);
 
234
  if (copy_event != nullptr) {
235
  ACL_CHECK(aclrtDestroyEvent(copy_event));
236
  }
 
31
  #include <memory>
32
  #include <string>
33
  #include <vector>
34
+ #include <atomic>
35
+ #include <condition_variable>
36
+ #include <mutex>
37
+ #include <thread>
38
+ #include <unistd.h>
39
+ #include <functional>
40
 
41
  #include "../include/ggml-cann.h"
42
  #include "../include/ggml.h"
43
+ #include "../ggml-impl.h"
44
 
45
  #define MATRIX_ROW_PADDING 512
46
  #define GGML_CANN_MAX_STREAMS 8
 
212
  ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
213
  };
214
 
215
+ /**
216
+ * @brief Function pointer type for ACLNN operator calls.
217
+ */
218
+ using aclnn_func_t = aclnnStatus (*)(void*, uint64_t, aclOpExecutor*, aclrtStream);
219
+
220
+ /**
221
+ * @brief Base class for all CANN tasks to be submitted to the task queue.
222
+ *
223
+ * Users should override the run_task() method with actual task logic.
224
+ */
225
+ class cann_task {
226
+ public:
227
+ virtual void run_task() {}
228
+ };
229
+
230
+ /**
231
+ * @brief A lock-free ring-buffer based task queue for asynchronously executing cann_task instances.
232
+ */
233
+ class cann_task_queue {
234
+ public:
235
+ /**
236
+ * @brief Constructs a task queue with a fixed power-of-two capacity for a specific device.
237
+ *
238
+ * @param capacity Queue capacity. Must be a power of 2.
239
+ * @param device Target device ID (used for context setting).
240
+ */
241
+ explicit cann_task_queue(size_t capacity, int32_t device)
242
+ : buffer_(capacity), capacity_(capacity), head_(0), tail_(0),
243
+ running_(false), device_(device) {
244
+ GGML_ASSERT((capacity & (capacity - 1)) == 0 && "capacity must be power of 2");
245
+ mask_ = capacity_ - 1;
246
+ }
247
+
248
+ /**
249
+ * @brief Attempts to enqueue a task into the queue.
250
+ *
251
+ * @param item Unique pointer to the task.
252
+ * @return true if the task was successfully enqueued, false if the queue was full.
253
+ */
254
+ bool enqueue(std::unique_ptr<cann_task>&& item) {
255
+ size_t next_tail = (tail_ + 1) & mask_;
256
+
257
+ if (next_tail == head_) {
258
+ return false;
259
+ }
260
+
261
+ buffer_[tail_] = std::move(item);
262
+ std::atomic_thread_fence(std::memory_order_release);
263
+ tail_ = next_tail;
264
+
265
+ return true;
266
+ }
267
+
268
+ /**
269
+ * @brief Submits a task to the queue, and starts the worker thread if not already running.
270
+ *
271
+ * @param task Task to be submitted.
272
+ */
273
+ void submit_task(std::unique_ptr<cann_task>&& task) {
274
+ while(!enqueue(std::move(task))) {
275
+ std::this_thread::yield();
276
+ continue;
277
+ }
278
+
279
+ if (!running_) {
280
+ running_ = true;
281
+ thread_ = std::thread(&cann_task_queue::execute, this);
282
+ }
283
+
284
+ }
285
+
286
+ /**
287
+ * @brief Waits until the queue is completely empty and no tasks are being processed.
288
+ */
289
+ void wait() {
290
+ while (running_ && head_ != tail_) {
291
+ std::this_thread::yield();
292
+ continue;
293
+ }
294
+ }
295
+
296
+ /**
297
+ * @brief Stops the task queue and joins the worker thread.
298
+ */
299
+ void stop() {
300
+ running_ = false;
301
+ if (thread_.joinable()) {
302
+ thread_.join();
303
+ }
304
+ }
305
+
306
+ private:
307
+ /**
308
+ * @brief Worker thread function that continuously dequeues and executes tasks.
309
+ */
310
+ void execute() {
311
+ ggml_cann_set_device(device_);
312
+
313
+ while (running_) {
314
+ if(head_ == tail_) {
315
+ std::this_thread::yield();
316
+ continue;
317
+ }
318
+
319
+ std::atomic_thread_fence(std::memory_order_acquire);
320
+ buffer_[head_]->run_task();
321
+ buffer_[head_].reset();
322
+ head_ = (head_ + 1) & mask_;
323
+ }
324
+ }
325
+
326
+ std::vector<std::unique_ptr<cann_task>> buffer_;
327
+ const size_t capacity_;
328
+ size_t mask_;
329
+ size_t head_;
330
+ size_t tail_;
331
+ bool running_;
332
+ std::thread thread_;
333
+ int32_t device_;
334
+ };
335
+
336
  /**
337
  * @brief Context for managing CANN backend operations.
338
  */
 
341
  std::string name; /**< Name of the device. */
342
  std::string description; /**< Description of the device. */
343
  aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
344
+ cann_task_queue task_queue;
345
+ bool async_mode;
346
 
347
  aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
348
 
 
351
  * @param device Device ID.
352
  */
353
  explicit ggml_backend_cann_context(int device)
354
+ : device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
355
  ggml_cann_set_device(device);
356
  description = aclrtGetSocName();
357
+ async_mode = (getenv("GGML_CANN_ASYNC_MODE") != nullptr);
358
+ GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
359
+ device, async_mode ? "ON" : "OFF");
360
  }
361
 
362
  /**
 
364
  */
365
  ~ggml_backend_cann_context() {
366
  ggml_cann_set_device(device);
367
+ task_queue.stop();
368
  if (copy_event != nullptr) {
369
  ACL_CHECK(aclrtDestroyEvent(copy_event));
370
  }
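
The queue is used as a single-producer/single-consumer ring: the main thread enqueues, and one lazily started worker thread per device dequeues and runs tasks in FIFO order on that device. A minimal usage sketch (print_task and task_queue_example are hypothetical names, not part of this patch):

class print_task : public cann_task {
public:
    explicit print_task(int id) : id_(id) {}
    void run_task() override {
        GGML_LOG_INFO("task %d executed on the CANN worker thread\n", id_);
    }
private:
    int id_;
};

static void task_queue_example(ggml_backend_cann_context& ctx) {
    // submit_task() spins until a ring slot is free and starts the worker
    // thread on first use.
    ctx.task_queue.submit_task(std::make_unique<print_task>(0));
    ctx.task_queue.submit_task(std::make_unique<print_task>(1));

    // wait() yields until head_ == tail_, i.e. everything queued so far has
    // run; the backend does this before stream synchronization and before
    // cross-device copies to keep task order.
    ctx.task_queue.wait();
}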
ggml/src/ggml-cann/ggml-cann.cpp CHANGED
@@ -1606,7 +1606,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1606
  auto lambda = [](ggml_backend_cann_context& ctx,
1607
  aclTensor* acl_src,
1608
  aclTensor* acl_dst) {
1609
- GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
1610
  };
1611
  ggml_cann_unary_op(lambda, ctx, dst);
1612
  } break;
@@ -1789,12 +1789,11 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
1789
  delete backend;
1790
  }
1791
 
 
1792
  /**
1793
  * @brief Sets tensor data asynchronously in the CANN backend.
1794
  *
1795
- * This function asynchronously sets tensor data in the CANN backend. Depending
1796
- * on the tensor type, it may perform data transformations before copying data
1797
- * to the device.
1798
  *
1799
  * @param backend Pointer to the CANN backend structure.
1800
  * @param tensor Pointer to the tensor structure to set data for.
@@ -1809,23 +1808,28 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1809
  size_t size) {
1810
  ggml_backend_cann_context *cann_ctx =
1811
  (ggml_backend_cann_context *)backend->context;
1812
 
1813
- if (!need_transform(tensor->type)) {
1814
- ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data,
1815
- size, ACL_MEMCPY_HOST_TO_DEVICE,
1816
- cann_ctx->stream()));
1817
- } else {
1818
- void *transform_buffer = malloc(size);
1819
- ggml_backend_cann_transform(tensor, data, transform_buffer);
1820
 
1821
- ACL_CHECK(aclrtMemcpyAsync(
1822
- (char *)tensor->data + offset, size, transform_buffer, size,
1823
- ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
1824
- ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1825
- free(transform_buffer);
1826
- }
1827
  }
1828
 
1829
  static void ggml_backend_cann_get_tensor_async(
1830
  ggml_backend_t backend, const ggml_tensor *tensor, void *data,
1831
  size_t offset, size_t size) {
@@ -1836,20 +1840,11 @@ static void ggml_backend_cann_get_tensor_async(
1836
 
1837
  GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1838
  "unsupported buffer type");
1839
 
1840
- if (!need_transform(tensor->type)) {
1841
- ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset,
1842
- size, ACL_MEMCPY_DEVICE_TO_HOST,
1843
- cann_ctx->stream()));
1844
- } else {
1845
- void *transform_buffer = malloc(size);
1846
- ACL_CHECK(aclrtMemcpyAsync(
1847
- transform_buffer, size, (char *)tensor->data + offset, size,
1848
- ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream()));
1849
- ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1850
- ggml_backend_cann_transform_back(tensor, transform_buffer, data);
1851
- free(transform_buffer);
1852
- }
1853
  }
1854
 
1855
  /**
@@ -1909,6 +1904,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
1909
  ggml_cann_set_device(cann_ctx_src->device);
1910
  ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
1911
 
1912
  ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
1913
  ACL_MEMCPY_DEVICE_TO_DEVICE,
1914
  cann_ctx_src->stream()));
@@ -1936,9 +1933,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
1936
  static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1937
  ggml_backend_cann_context* cann_ctx =
1938
  (ggml_backend_cann_context*)backend->context;
1939
-
1940
  ggml_cann_set_device(cann_ctx->device);
1941
-
1942
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1943
  }
1944
 
 
1606
  auto lambda = [](ggml_backend_cann_context& ctx,
1607
  aclTensor* acl_src,
1608
  aclTensor* acl_dst) {
1609
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1610
  };
1611
  ggml_cann_unary_op(lambda, ctx, dst);
1612
  } break;
 
1789
  delete backend;
1790
  }
1791
 
1792
+
1793
  /**
1794
  * @brief Sets tensor data asynchronously in the CANN backend.
1795
  *
1796
+ * This function asynchronously sets tensor data in the CANN backend.
1797
  *
1798
  * @param backend Pointer to the CANN backend structure.
1799
  * @param tensor Pointer to the tensor structure to set data for.
 
1808
  size_t size) {
1809
  ggml_backend_cann_context *cann_ctx =
1810
  (ggml_backend_cann_context *)backend->context;
1811
+ ggml_backend_buffer_t buf =
1812
+ tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1813
 
1814
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1815
+ "unsupported buffer type");
1816
+ GGML_ASSERT(!ggml_is_quantized(tensor->type));
1817
 
1818
+ ggml_cann_async_memcpy(cann_ctx, (char *)tensor->data + offset, data, size,
1819
+ ACL_MEMCPY_HOST_TO_DEVICE);
1820
  }
1821
 
1822
+ /**
1823
+ * @brief Gets tensor data asynchronously in the CANN backend.
1824
+ *
1825
+ * This function asynchronously gets tensor data in the CANN backend.
1826
+ *
1827
+ * @param backend Pointer to the CANN backend structure.
1828
+ * @param tensor Pointer to the tensor structure to get data from.
1829
+ * @param data Pointer to the host data to copy from the tensor.
1830
+ * @param offset Offset in bytes within the host data.
1831
+ * @param size Size of the data to copy in bytes.
1832
+ */
1833
  static void ggml_backend_cann_get_tensor_async(
1834
  ggml_backend_t backend, const ggml_tensor *tensor, void *data,
1835
  size_t offset, size_t size) {
 
1840
 
1841
  GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1842
  "unsupported buffer type");
1843
+ GGML_ASSERT(!ggml_is_quantized(tensor->type));
1844
+
1845
+ ggml_cann_async_memcpy(cann_ctx, data, (char *)tensor->data + offset, size,
1846
+ ACL_MEMCPY_DEVICE_TO_HOST);
1847
 
 
1848
  }
1849
 
1850
  /**
 
1904
  ggml_cann_set_device(cann_ctx_src->device);
1905
  ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
1906
 
1907
+ // wait for task_queue empty to keep task order.
1908
+ cann_ctx_src->task_queue.wait();
1909
  ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
1910
  ACL_MEMCPY_DEVICE_TO_DEVICE,
1911
  cann_ctx_src->stream()));
 
1933
  static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1934
  ggml_backend_cann_context* cann_ctx =
1935
  (ggml_backend_cann_context*)backend->context;
1936
+ cann_ctx->task_queue.wait();
1937
  ggml_cann_set_device(cann_ctx->device);
 
1938
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1939
  }
1940
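
As the commit message notes, asynchronous submission is off by default and controlled by GGML_CANN_ASYNC_MODE; the constructor above only checks whether the variable is set, not its value. A hedged sketch of enabling it from a host program before the backend is created (setenv is POSIX; the ggml_backend_cann_init entry point referenced in the comment is assumed, not shown in this diff):

#include <cstdlib>

int main() {
    // Any value enables async operator submission; leaving the variable unset
    // keeps the previous synchronous behaviour.
    setenv("GGML_CANN_ASYNC_MODE", "1", /*overwrite=*/1);

    // Create the CANN backend afterwards so the context constructor sees the
    // variable and logs "async operator submission is ON" for the device,
    // e.g. ggml_backend_t backend = ggml_backend_cann_init(0);
    return 0;
}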