Shanshan Shen (shanshan shen) and Frank Mai committed
Commit f9fd6d6 · 1 Parent(s): b357ea7

CANN: Improve the Inferencing Performance for Ascend NPU Device (llama/10454)


* improve inferencing performance for ascend npu.

Co-authored-by: Frank Mai <thxCode@[email protected]>

* some modification after review

* some modifications after review

* restore some modifications

* restore some modifications

---------

Co-authored-by: shanshan shen <[email protected]>
Co-authored-by: Frank Mai <thxCode@[email protected]>
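
Note on the main change in aclnn_ops.cpp below: the floating-point mul_mat path now inspects the broadcasted shapes and dispatches to aclnnMm (pure 2D) or aclnnBatchMatMul (3D) instead of always going through the generic aclnnMatmul. The following is a minimal, standalone C++ sketch of that shape-based dispatch; select_n_dims and the plain int64_t arrays are illustrative stand-ins for the real bcast_* values, not code from this patch.

#include <cstdint>
#include <cstdio>

// Sketch of the n_dims selection added to ggml_cann_mat_mul_fp: if the
// broadcasted input and weight both collapse to a single batch in dims 2 and 3,
// the multiplication can be issued as a plain 2D Mm; if only dim 3 collapses,
// it can be issued as a 3D BatchMatMul; otherwise the generic Matmul is kept.
static int select_n_dims(const int64_t input_ne[4], const int64_t weight_ne[4], int bcast_dims) {
    int n_dims = bcast_dims;
    if (input_ne[3] == weight_ne[3] && input_ne[3] == 1) {
        if (input_ne[2] == 1 && weight_ne[2] == 1) {
            n_dims = 2;   // single matrix: use aclnnMm
        } else if (input_ne[2] == 1) {
            n_dims = 3;   // batched along dim 2 only: use aclnnBatchMatMul
        }
    }
    return n_dims;        // anything else: fall back to aclnnMatmul
}

int main() {
    const int64_t input_ne[4]  = {128, 32, 1, 1};    // hypothetical shapes for illustration
    const int64_t weight_ne[4] = {128, 256, 1, 1};
    printf("n_dims = %d\n", select_n_dims(input_ne, weight_ne, 4));  // prints 2
    return 0;
}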

ggml/src/ggml-cann/aclnn_ops.cpp CHANGED
@@ -33,6 +33,8 @@
 #include <aclnnop/aclnn_group_norm.h>
 #include <aclnnop/aclnn_index_fill_tensor.h>
 #include <aclnnop/aclnn_layer_norm.h>
+#include <aclnnop/aclnn_mm.h>
+#include <aclnnop/aclnn_batch_matmul.h>
 #include <aclnnop/aclnn_matmul.h>
 #include <aclnnop/aclnn_max_pool.h>
 #include <aclnnop/aclnn_permute.h>
@@ -2423,7 +2425,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
                           aclTensor* acl_weight, aclTensor* acl_dst) {
     int8_t cube_math_type = 1;  // ALLOW_FP32_DOWN_PRECISION, when input is
                                 // fp32, atlas a2 will transpose it to HFLOAT32.
-
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
@@ -2441,6 +2442,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
         aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
 }
 
+/**
+ * @brief Performs matrix multiplication of two 2D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text{acl_dst} = \text{acl_input} @ \text{acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                      cube_math_type, &workspaceSize,
+                                      &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs matrix multiplication of two 3D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text{acl_dst} = \text{acl_input} @ \text{acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                               cube_math_type, &workspaceSize,
+                                               &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
 /**
  * @brief Performs matrix multiplication with floating-point precision on
  * tensors using the CANN backend.
@@ -2462,20 +2537,43 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     // broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
     BCAST_MUL_MAT_SHAPE(input, weight, dst);
 
-    // transpose weight: [1,2,3,4] -> [1,2,4,3]
-    int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
-                              bcast_weight_ne[2], bcast_weight_ne[3],
-                              bcast_weight_ne[4], bcast_weight_ne[5]};
-    size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
-                             bcast_weight_nb[2], bcast_weight_nb[3],
-                             bcast_weight_nb[4], bcast_weight_nb[5]};
+    int64_t n_dims = bcast_dims;
+    if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
+        if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
+            n_dims = 2;
+        } else if (bcast_input_ne[2] == 1) {
+            n_dims = 3;
+        }
+    }
 
-    aclTensor* acl_weight_tensor =
-        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims);
     aclTensor* acl_input_tensor =
-        ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input));
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst));
-    aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+        ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
+    int64_t transpose_ne[] = {
+        bcast_weight_ne[1], bcast_weight_ne[0],
+        bcast_weight_ne[2], bcast_weight_ne[3],
+        bcast_weight_ne[4], bcast_weight_ne[5]
+    };
+    size_t transpose_nb[] = {
+        bcast_weight_nb[1], bcast_weight_nb[0],
+        bcast_weight_nb[2], bcast_weight_nb[3],
+        bcast_weight_nb[4], bcast_weight_nb[5]
+    };
+    aclTensor* acl_weight_tensor =
+        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
+
+    switch (n_dims) {
+        case 2:
+            aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        case 3:
+            aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        default:
+            aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+    }
 
     ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
     ACL_CHECK(aclDestroyTensor(acl_input_tensor));
@@ -2501,46 +2599,40 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     ggml_tensor* src0 = dst->src[0];  // weight
     ggml_tensor* src1 = dst->src[1];  // input
 
-    // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
-    // is regarded as batch. weight need transpose.
-    int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
+    // The shape of the weight is NCHW.
+    // Matrix multiplication uses HW dims.
+    // HC is regarded as batch.
+    // weight need transpose.
    float weight_elem_size;
    if (type == GGML_TYPE_Q4_0) {
        weight_elem_size = float(sizeof(uint8_t)) / 2;
-    }
-    else if (type == GGML_TYPE_Q8_0) {
+    } else if (type == GGML_TYPE_Q8_0) {
        weight_elem_size = float(sizeof(uint8_t));
-    }
-    else {
+    } else {
        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
    }
-    float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
-
-    // size of one matrix is element_size * height * width.
-    size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
+    float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
+    size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
    size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
 
    // scale stored at the end of weight. Also need transpose.
-    GGML_ASSERT(QK4_0 == QK8_0);
-    int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
    size_t scale_elem_size = sizeof(uint16_t);
-    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
-                         scale_elem_size};
-    size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0;
+    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
+    size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
    char* scale_offset = (char*)src0->data + weight_size;
 
    // input
-    void* input_buffer;
    size_t input_elem_size = sizeof(uint16_t);
    int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
-    size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
-    size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
-
+    size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
+    size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
    ggml_cann_pool_alloc input_alloctor(ctx.pool());
+    void* input_buffer = src1->data;
+
+    // case in
    if (src1->type != GGML_TYPE_F16) {
        aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
-        input_buffer = input_alloctor.get();
+        input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
 
        int64_t* input_cast_ne = src1->ne;
        size_t input_cast_nb[GGML_MAX_DIMS];
@@ -2550,88 +2642,139 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
        }
 
        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
-            input_cast_nb, GGML_MAX_DIMS);
+            input_buffer,
+            ACL_FLOAT16,
+            input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
        aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
+
        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
        ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
-    } else {
-        input_buffer = src1->data;
    }
 
    // output
    size_t output_elem_size = sizeof(uint16_t);
-    int64_t output_ne[] = {dst->ne[0], dst->ne[1]};
-    size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]};
-    ggml_cann_pool_alloc output_alloctor(
-        ctx.pool(), ggml_nelements(dst) * output_elem_size);
-    void* output_buffer = output_alloctor.get();
-    size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1];
+    size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
+    ggml_cann_pool_alloc output_allocator(ctx.pool());
+    void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
+    size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
 
    // aclnn
+    int64_t max_elem_size = 65535;
+    int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
+    aclOpExecutor* executor = nullptr;
    uint64_t workspaceSize = 0;
-    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;
-
    for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
        for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
            int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
            int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
 
-            int64_t batch1 = n1 * src1->ne[2] + c1;
-            int64_t batch0 = n0 * src0->ne[2] + c0;
+            int64_t batch1 = (n1 * src1->ne[2]) + c1;
+            int64_t batch0 = (n0 * src0->ne[2]) + c0;
 
            aclTensor* acl_input_tensor = ggml_cann_create_tensor(
                (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
                input_elem_size, input_ne, input_nb, 2);
+
+            // first split
+            int64_t weight_ne_offset = 0;
+            int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
+            int64_t scale_ne_offset = 0;
+            int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
+            int64_t output_ne_offset = 0;
+            int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
+
            aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                (char*)src0->data + batch0 * weight_stride,
-                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
-                weight_nb, 2);
+                ggml_cann_type_mapping(type),
+                weight_elem_size, weight_ne, weight_nb, 2,
+                ACL_FORMAT_ND, weight_ne_offset);
            aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
-                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
-                scale_elem_size, scale_ne, scale_nb, 2);
+                scale_offset + batch0 * scale_stride,
+                ACL_FLOAT16,
+                scale_elem_size, scale_ne, scale_nb, 2,
+                ACL_FORMAT_ND, scale_ne_offset);
            aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-                (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
-                output_elem_size, output_ne, output_nb, 2);
+                (char*)output_buffer + batch1 * output_stride,
+                ACL_FLOAT16,
+                output_elem_size, output_ne, output_nb, 2,
+                ACL_FORMAT_ND, output_ne_offset);
 
            ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
-                acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-                nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
-                &workspaceSize, &executor));
-
-            if (workspaceSize > 0 && workspaceAddr == nullptr) {
-                ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
-                                                         workspaceSize);
-                workspaceAddr = workspace_allocator.get();
+                acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                nullptr, nullptr, nullptr, nullptr, QK8_0,
+                acl_output_tensor, &workspaceSize, &executor));
+            if (workspaceAddr == nullptr) {
+                workspaceAddr = workspace_allocator.alloc(workspaceSize);
            }
-
            ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
                workspaceAddr, workspaceSize, executor, ctx.stream()));
 
-            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
            ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
            ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
            ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+
+            // other splits
+            for (int64_t split = 1; split < split_size; split++) {
+                weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
+                weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
+                scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
+                scale_ne[0] = weight_ne[0];
+                output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
+                output_ne[0] = weight_ne[0];
+
+                acl_weight_tensor = ggml_cann_create_tensor(
+                    (char*)src0->data + batch0 * weight_stride,
+                    ggml_cann_type_mapping(type),
+                    weight_elem_size, weight_ne, weight_nb, 2,
+                    ACL_FORMAT_ND, weight_ne_offset);
+                acl_scale_tensor = ggml_cann_create_tensor(
+                    scale_offset + batch0 * scale_stride,
+                    ACL_FLOAT16,
+                    scale_elem_size, scale_ne, scale_nb, 2,
+                    ACL_FORMAT_ND, scale_ne_offset);
+                acl_output_tensor = ggml_cann_create_tensor(
+                    (char*)output_buffer + batch1 * output_stride,
+                    ACL_FLOAT16,
+                    output_elem_size, output_ne, output_nb, 2,
+                    ACL_FORMAT_ND, output_ne_offset);
+
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
+                    acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                    nullptr, nullptr, nullptr, nullptr, QK8_0,
+                    acl_output_tensor, &workspaceSize, &executor));
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
+                    workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+                ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+            }
+
+            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
        }
    }
 
    // cast out
-    int64_t* output_cast_ne = dst->ne;
-    size_t output_cast_nb[GGML_MAX_DIMS];
-    output_cast_nb[0] = sizeof(uint16_t);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
-    }
+    if (dst->type != GGML_TYPE_F16) {
+        int64_t* output_cast_ne = dst->ne;
+        size_t output_cast_nb[GGML_MAX_DIMS];
+        output_cast_nb[0] = sizeof(uint16_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
+        }
 
-    aclTensor* acl_output_tensor =
-        ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
-                                output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
-    aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-    aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT);
+        aclTensor* acl_output_tensor = ggml_cann_create_tensor(
+            output_buffer,
+            ACL_FLOAT16,
+            output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
+        aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
+        aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
 
-    ACL_CHECK(aclDestroyTensor(acl_output_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
+    }
 }
 
 void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
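
The quantized path above is also reworked so that aclnnWeightQuantBatchMatmulV2 is driven per split of at most 65535 weight columns. Below is a standalone C++ sketch of how the split widths come out under that scheme; the 150000-column count is a made-up example, not a value from the patch.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Sketch of the column-splitting scheme used in ggml_cann_mul_mat_quant above:
// a weight dimension of n columns is processed in chunks of at most 65535,
// with the last chunk holding the remainder.
int main() {
    const int64_t max_elem_size = 65535;
    const int64_t n_cols        = 150000;                       // hypothetical src0->ne[1]
    const int64_t split_size    = (n_cols / max_elem_size) + 1; // number of splits, as in the patch

    int64_t offset = 0;
    for (int64_t split = 0; split < split_size; split++) {
        // width of this split: full max_elem_size except possibly the last one
        int64_t width = std::min(max_elem_size, n_cols - offset);
        printf("split %lld: cols [%lld, %lld)\n",
               (long long)split, (long long)offset, (long long)(offset + width));
        offset += width;
    }
    return 0;
}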
ggml/src/ggml-cann/common.h CHANGED
@@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc {
 struct ggml_backend_cann_context {
     int32_t device;                  /**< Device ID. */
     std::string name;                /**< Name of the device. */
+    std::string description;         /**< Description of the device. */
     aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
 
-    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
-        {nullptr}}; /**< Array of streams for the device. */
+    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
 
     /**
      * @brief Constructor for initializing the context with a given device.
      * @param device Device ID.
      */
     explicit ggml_backend_cann_context(int device)
-        : device(device), name("CANN" + std::to_string(device)) {}
+        : device(device), name("CANN" + std::to_string(device)) {
+        ggml_cann_set_device(device);
+        description = aclrtGetSocName();
+    }
 
     /**
      * @brief Destructor for cleaning up resources.
ggml/src/ggml-cann/ggml-cann.cpp CHANGED
@@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() {
         ACL_CHECK(aclrtMemGetAllocationGranularity(
             &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
             &info.devices[id].vmm_granularity));
+
+        size_t free, total;
+        ggml_backend_cann_get_device_memory(id, &free, &total);
+        info.devices[id].total_vram = free;
     }
 
     // TODO: add more device info later.
@@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
      * @return A pointer to the allocated buffer.
      */
     void* alloc(size_t size, size_t* actual_size) override {
+        const size_t alignment = 128;
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }
 #ifdef DEBUG_CANN_MALLOC
         int nnz = 0;
         size_t max_size = 0;
@@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
             return ptr;
         }
         void* ptr;
-        size_t look_ahead_size = (size_t)(1.05 * size);
-        look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
         ggml_cann_set_device(device);
         ACL_CHECK(
-            aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST));
-        *actual_size = look_ahead_size;
-        pool_size += look_ahead_size;
+            aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+        *actual_size = size;
+        pool_size += size;
 #ifdef DEBUG_CANN_MALLOC
         GGML_LOG_INFO(
             "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
@@ -296,7 +303,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
     /**
      * @brief The maximum size of the virtual memory pool (32 GB).
      */
-    static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35;  // 32 GB
+    size_t max_size;
 
     /**
      * @brief The device ID associated with this buffer pool.
@@ -341,7 +348,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
     */
    explicit ggml_cann_pool_vmm(int device)
        : device(device),
-          granularity(ggml_cann_info().devices[device].vmm_granularity) {}
+          granularity(ggml_cann_info().devices[device].vmm_granularity) {
+        auto dev = ggml_cann_info().devices[device];
+        granularity = dev.vmm_granularity;
+        max_size = dev.total_vram;
+    }
 
    /**
     * @brief Destructor to free all buffers in the virtual memory pool.
@@ -370,17 +381,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
        // round up the allocation size to the alignment to ensure that all
        // allocations are aligned for all data types
        const size_t alignment = 128;
-        size = alignment * ((size + alignment - 1) / alignment);
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }
 
        size_t avail = pool_size - pool_used;
 
        if (size > avail) {
            // round up to the next multiple of the granularity
            size_t reserve_size = size - avail;
-            reserve_size =
-                granularity * ((reserve_size + granularity - 1) / granularity);
+            reserve_size = GGML_PAD(reserve_size, granularity);
 
-            GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE);
+            GGML_ASSERT(pool_size + reserve_size <= max_size);
 
            // allocate more physical memory
            aclrtPhysicalMemProp prop = {};
@@ -396,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
            // reserve virtual address space (if not already reserved)
            if (pool_addr == 0) {
                ACL_CHECK(aclrtReserveMemAddress(
-                    &pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1));
+                    &pool_addr, max_size, 0, NULL, 1));
            }
 
            // map at the end of the pool
@@ -409,10 +422,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
            // add to the pool
            pool_size += reserve_size;
 
-            // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
-            //   reserved %llu MB)\n",
-            //   device, (unsigned long long) (pool_size/1024/1024),
-            //   (unsigned long long) (reserve_size/1024/1024));
+#ifdef DEBUG_CANN_MALLOC
+            GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+                          device, (unsigned long long) (pool_size/1024/1024),
+                          (unsigned long long) (reserve_size/1024/1024));
+#endif
        }
 
        GGML_ASSERT(pool_addr != 0);
@@ -457,7 +471,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
 */
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
    int device) {
-    // return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
    return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
}
 
@@ -1130,10 +1143,10 @@ ggml_backend_cann_buffer_type(int32_t device) {
     static bool ggml_backend_cann_buffer_type_initialized = false;
 
     if (!ggml_backend_cann_buffer_type_initialized) {
-        for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
+        for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
             ggml_backend_cann_buffer_types[i] = {
                 /* .iface   = */ ggml_backend_cann_buffer_type_interface,
-                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
+                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
                 /* .context = */
                 new ggml_backend_cann_buffer_type_context{
                     i, "CANN" + std::to_string(i)},
@@ -1199,10 +1212,15 @@ static void * ggml_cann_host_malloc(size_t size) {
         return nullptr;
     }
 
+    const size_t alignment = 128;
+    size = GGML_PAD(size, alignment);
+    if (size == 0) {
+        size = alignment;
+    }
+
     void * hostPtr = nullptr;
     aclError err = aclrtMallocHost((void **) &hostPtr, size);
     if (err != ACL_SUCCESS) {
-
         GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                       size / 1024.0 / 1024.0, aclGetRecentErrMsg());
         return nullptr;
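
Several of the allocation paths above now pad the requested size to a 128-byte alignment (and never pass a zero-byte request) before calling aclrtMalloc or aclrtMallocHost. A small standalone illustration of that rounding follows; PAD_UP is a local stand-in mirroring ggml's GGML_PAD macro (round up to a multiple of a power-of-two alignment), and the sample sizes are arbitrary.

#include <cstddef>
#include <cstdio>

// Local stand-in for ggml's GGML_PAD: round x up to a multiple of n
// (n is a power of two here, as with the 128-byte alignment used above).
#define PAD_UP(x, n) (((x) + (n) - 1) & ~((n) - 1))

int main() {
    const size_t alignment = 128;
    size_t sizes[] = {0, 1, 128, 129, 1000};
    for (size_t s : sizes) {
        size_t padded = PAD_UP(s, alignment);
        if (padded == 0) {
            padded = alignment;  // the patch never hands a zero-byte request to the allocator
        }
        printf("request %zu -> allocate %zu\n", s, padded);
    }
    return 0;
}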