Commit f9fd6d6
Parent(s): b357ea7
Committed by Shanshan Shen

CANN: Improve the Inferencing Performance for Ascend NPU Device (llama/10454)

* improve inferencing performance for ascend npu.

Co-authored-by: Frank Mai <thxCode@[email protected]>

* some modification after review

* some modifications after review

* restore some modifications

* restore some modifications

---------

Co-authored-by: shanshan shen <[email protected]>
Co-authored-by: Frank Mai <thxCode@[email protected]>
- ggml/src/ggml-cann/aclnn_ops.cpp +221 -78
- ggml/src/ggml-cann/common.h +6 -3
- ggml/src/ggml-cann/ggml-cann.cpp +38 -20
ggml/src/ggml-cann/aclnn_ops.cpp
CHANGED

@@ -33,6 +33,8 @@
 #include <aclnnop/aclnn_group_norm.h>
 #include <aclnnop/aclnn_index_fill_tensor.h>
 #include <aclnnop/aclnn_layer_norm.h>
+#include <aclnnop/aclnn_mm.h>
+#include <aclnnop/aclnn_batch_matmul.h>
 #include <aclnnop/aclnn_matmul.h>
 #include <aclnnop/aclnn_max_pool.h>
 #include <aclnnop/aclnn_permute.h>
@@ -2423,7 +2425,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
                           aclTensor* acl_weight, aclTensor* acl_dst) {
     int8_t cube_math_type = 1;  // ALLOW_FP32_DOWN_PRECISION, when input is
                                 // fp32, atlas a2 will transpose it to HFLOAT32.
-
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
@@ -2441,6 +2442,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
         aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
 }
 
+/**
+ * @brief Performs matrix multiplication of two 2D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text{acl_dst} = \text{acl_input} @ \text{acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                      cube_math_type, &workspaceSize,
+                                      &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs matrix multiplication of two 3D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text{acl_dst} = \text{acl_input} @ \text{acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                               cube_math_type, &workspaceSize,
+                                               &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
 /**
  * @brief Performs matrix multiplication with floating-point precision on
  * tensors using the CANN backend.
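Both helpers added above follow the two-phase aclnn calling convention used throughout this file: plan the operator and query its workspace size, allocate scratch from the context pool only if one is needed, then launch on the context stream. A minimal sketch of that pattern; aclnnXxxGetWorkspaceSize/aclnnXxx are placeholders standing in for a concrete operator pair such as aclnnMm or aclnnBatchMatMul, not a real API:

// Sketch of the two-phase aclnn dispatch pattern (placeholder names).
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;

// Phase 1: plan the operator and query its scratch-memory requirement.
ACL_CHECK(aclnnXxxGetWorkspaceSize(acl_input, acl_weight, acl_dst,
                                   cube_math_type, &workspaceSize, &executor));

// Allocate workspace from the backend pool only when the op asked for any.
if (workspaceSize > 0) {
    ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
    workspaceAddr = workspace_allocator.get();
}

// Phase 2: enqueue the planned operator on this context's stream.
ACL_CHECK(aclnnXxx(workspaceAddr, workspaceSize, executor, ctx.stream()));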
@@ -2462,20 +2537,43 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     // broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
     BCAST_MUL_MAT_SHAPE(input, weight, dst);
 
+    int64_t n_dims = bcast_dims;
+    if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
+        if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
+            n_dims = 2;
+        } else if (bcast_input_ne[2] == 1) {
+            n_dims = 3;
+        }
+    }
 
-    aclTensor* acl_weight_tensor =
-        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims);
     aclTensor* acl_input_tensor =
-        ggml_cann_create_tensor(input,
+        ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
+    int64_t transpose_ne[] = {
+        bcast_weight_ne[1], bcast_weight_ne[0],
+        bcast_weight_ne[2], bcast_weight_ne[3],
+        bcast_weight_ne[4], bcast_weight_ne[5]
+    };
+    size_t transpose_nb[] = {
+        bcast_weight_nb[1], bcast_weight_nb[0],
+        bcast_weight_nb[2], bcast_weight_nb[3],
+        bcast_weight_nb[4], bcast_weight_nb[5]
+    };
+    aclTensor* acl_weight_tensor =
+        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
+
+    switch (n_dims) {
+        case 2:
+            aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        case 3:
+            aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        default:
+            aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+    }
 
     ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
     ACL_CHECK(aclDestroyTensor(acl_input_tensor));
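The n_dims collapse above is the heart of the float matmul speedup: once broadcasting has been resolved, batch dimensions of size 1 are dropped so the call can route to the cheaper 2D (aclnnMm) or 3D (aclnnBatchMatMul) operator instead of the generic aclnnMatmul. A self-contained sketch of the same rule on sample shapes; the function name and shapes are illustrative, not from the file:

// Illustration only: reproduces the n_dims collapse rule on sample shapes.
// The ne arrays follow ggml's [ne0, ne1, ne2, ne3] convention (ne0 fastest).
#include <cstdint>
#include <cstdio>

static int collapse_dims(const int64_t in_ne[4], const int64_t w_ne[4], int bcast_dims) {
    int n_dims = bcast_dims;
    if (in_ne[3] == w_ne[3] && in_ne[3] == 1) {
        if (in_ne[2] == 1 && w_ne[2] == 1) {
            n_dims = 2;                 // plain matrix x matrix -> aclnn_mat_mul_2d
        } else if (in_ne[2] == 1) {
            n_dims = 3;                 // batched weight, flat input -> aclnn_mat_mul_3d
        }
    }
    return n_dims;
}

int main() {
    const int64_t in_a[4] = {4096, 32, 1, 1}, w_a[4] = {4096, 4096, 1, 1};
    const int64_t in_b[4] = {128, 32, 1, 1},  w_b[4] = {128, 128, 8, 1};
    printf("%d %d\n", collapse_dims(in_a, w_a, 4),   // prints 2
                      collapse_dims(in_b, w_b, 4));  // prints 3
    return 0;
}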
@@ -2501,46 +2599,40 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     ggml_tensor* src0 = dst->src[0];  // weight
     ggml_tensor* src1 = dst->src[1];  // input
 
-    // The shape of the weight is NCHW.
-    //
+    // The shape of the weight is NCHW.
+    // Matrix multiplication uses HW dims.
+    // HC is regarded as batch.
+    // weight need transpose.
     float weight_elem_size;
     if (type == GGML_TYPE_Q4_0) {
         weight_elem_size = float(sizeof(uint8_t)) / 2;
-    }
-    else if (type == GGML_TYPE_Q8_0) {
+    } else if (type == GGML_TYPE_Q8_0) {
         weight_elem_size = float(sizeof(uint8_t));
-    }
-    else {
+    } else {
         GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
     }
-    float weight_nb[] = {
-    // size of one matrix is element_size * height * width.
-    size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
+    float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
+    size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
     size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
 
     // scale stored at the end of weight. Also need transpose.
-    GGML_ASSERT(QK4_0 == QK8_0);
-    int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
     size_t scale_elem_size = sizeof(uint16_t);
-    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
-    size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0;
+    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
+    size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
     char* scale_offset = (char*)src0->data + weight_size;
 
     // input
-    void* input_buffer;
     size_t input_elem_size = sizeof(uint16_t);
     int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
-    size_t input_nb[] = {input_elem_size,
-    size_t input_stride =
+    size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
+    size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
     ggml_cann_pool_alloc input_alloctor(ctx.pool());
+    void* input_buffer = src1->data;
+
+    // case in
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
-        input_buffer = input_alloctor.get();
+        input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
 
         int64_t* input_cast_ne = src1->ne;
         size_t input_cast_nb[GGML_MAX_DIMS];

@@ -2550,88 +2642,139 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
         }
 
         aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            input_buffer,
+            input_buffer,
+            ACL_FLOAT16,
+            input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
         aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
+
         ACL_CHECK(aclDestroyTensor(acl_input_tensor));
         ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
-    } else {
-        input_buffer = src1->data;
     }
 
     // output
     size_t output_elem_size = sizeof(uint16_t);
-    void* output_buffer = output_alloctor.get();
-    size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1];
+    size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
+    ggml_cann_pool_alloc output_allocator(ctx.pool());
+    void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
+    size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
 
     // aclnn
+    int64_t max_elem_size = 65535;
+    int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
+    aclOpExecutor* executor = nullptr;
     uint64_t workspaceSize = 0;
-    aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
-
     for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
         for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
             int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
             int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
 
-            int64_t batch1 = n1 * src1->ne[2] + c1;
-            int64_t batch0 = n0 * src0->ne[2] + c0;
+            int64_t batch1 = (n1 * src1->ne[2]) + c1;
+            int64_t batch0 = (n0 * src0->ne[2]) + c0;
 
             aclTensor* acl_input_tensor = ggml_cann_create_tensor(
                 (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
                 input_elem_size, input_ne, input_nb, 2);
+
+            // first split
+            int64_t weight_ne_offset = 0;
+            int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
+            int64_t scale_ne_offset = 0;
+            int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
+            int64_t output_ne_offset = 0;
+            int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
+
             aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                 (char*)src0->data + batch0 * weight_stride,
-                ggml_cann_type_mapping(type),
-                weight_nb, 2
+                ggml_cann_type_mapping(type),
+                weight_elem_size, weight_ne, weight_nb, 2,
+                ACL_FORMAT_ND, weight_ne_offset);
             aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
-                scale_offset + batch0 * scale_stride,
+                scale_offset + batch0 * scale_stride,
+                ACL_FLOAT16,
+                scale_elem_size, scale_ne, scale_nb, 2,
+                ACL_FORMAT_ND, scale_ne_offset);
             aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-                (char*)output_buffer + batch1 * output_stride,
+                (char*)output_buffer + batch1 * output_stride,
+                ACL_FLOAT16,
+                output_elem_size, output_ne, output_nb, 2,
+                ACL_FORMAT_ND, output_ne_offset);
 
             ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
-                acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
-                nullptr, nullptr, nullptr,
-                &workspaceSize, &executor));
-                ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
-                    workspaceSize);
-                workspaceAddr = workspace_allocator.get();
+                acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                nullptr, nullptr, nullptr, nullptr, QK8_0,
+                acl_output_tensor, &workspaceSize, &executor));
+            if (workspaceAddr == nullptr) {
+                workspaceAddr = workspace_allocator.alloc(workspaceSize);
             }
-
             ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
                 workspaceAddr, workspaceSize, executor, ctx.stream()));
 
-            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
             ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
             ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
             ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+
+            // other splits
+            for (int64_t split = 1; split < split_size; split++) {
+                weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
+                weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
+                scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
+                scale_ne[0] = weight_ne[0];
+                output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
+                output_ne[0] = weight_ne[0];
+
+                acl_weight_tensor = ggml_cann_create_tensor(
+                    (char*)src0->data + batch0 * weight_stride,
+                    ggml_cann_type_mapping(type),
+                    weight_elem_size, weight_ne, weight_nb, 2,
+                    ACL_FORMAT_ND, weight_ne_offset);
+                acl_scale_tensor = ggml_cann_create_tensor(
+                    scale_offset + batch0 * scale_stride,
+                    ACL_FLOAT16,
+                    scale_elem_size, scale_ne, scale_nb, 2,
+                    ACL_FORMAT_ND, scale_ne_offset);
+                acl_output_tensor = ggml_cann_create_tensor(
+                    (char*)output_buffer + batch1 * output_stride,
+                    ACL_FLOAT16,
+                    output_elem_size, output_ne, output_nb, 2,
+                    ACL_FORMAT_ND, output_ne_offset);
+
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
+                    acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                    nullptr, nullptr, nullptr, nullptr, QK8_0,
+                    acl_output_tensor, &workspaceSize, &executor));
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
+                    workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+                ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+            }
+
+            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
         }
     }
 
     // cast out
+    if (dst->type != GGML_TYPE_F16) {
+        int64_t* output_cast_ne = dst->ne;
+        size_t output_cast_nb[GGML_MAX_DIMS];
+        output_cast_nb[0] = sizeof(uint16_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
+        }
 
+        aclTensor* acl_output_tensor = ggml_cann_create_tensor(
+            output_buffer,
+            ACL_FLOAT16,
+            output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
+        aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
+        aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
 
+        ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
+    }
 }
 
 void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
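In the quantized path above, aclnnWeightQuantBatchMatmulV2 is invoked once per (n, c) batch, and the weight is additionally processed in chunks of at most max_elem_size = 65535 along src0->ne[1] (presumably an operator-side dimension limit; the diff itself only introduces the constant). A standalone sketch of the chunking arithmetic with an illustrative row count:

// Illustration of the weight splitting in ggml_cann_mul_mat_quant:
// ne1 is processed in chunks of at most max_elem_size, remainder last.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t max_elem_size = 65535;
    const int64_t ne1 = 152064;  // example value, e.g. a large output projection
    const int64_t split_size = (ne1 / max_elem_size) + 1;  // -> 3 chunks

    int64_t chunk = max_elem_size > ne1 ? ne1 : max_elem_size;  // first split
    printf("split 0: %lld\n", (long long) chunk);
    for (int64_t split = 1; split < split_size; split++) {
        // same expression as the diff: full chunk, or the remainder at the end
        chunk = max_elem_size * (split + 1) > ne1 ? ne1 - (max_elem_size * split)
                                                  : max_elem_size;
        printf("split %lld: %lld\n", (long long) split, (long long) chunk);
    }
    // prints 65535, 65535, 20994: the three chunks sum back to 152064
    return 0;
}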
ggml/src/ggml-cann/common.h
CHANGED

@@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc {
 struct ggml_backend_cann_context {
     int32_t device;                  /**< Device ID. */
     std::string name;                /**< Name of the device. */
+    std::string description;         /**< Description of the device. */
     aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
 
-    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
-        {nullptr}}; /**< Array of streams for the device. */
+    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
 
     /**
      * @brief Constructor for initializing the context with a given device.
      * @param device Device ID.
      */
     explicit ggml_backend_cann_context(int device)
-        : device(device), name("CANN" + std::to_string(device)) {
+        : device(device), name("CANN" + std::to_string(device)) {
+        ggml_cann_set_device(device);
+        description = aclrtGetSocName();
+    }
 
     /**
      * @brief Destructor for cleaning up resources.
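The new description field is filled once in the constructor from aclrtGetSocName(), which returns the SoC model string of the current device. A hedged sketch of reading the cached value back; the substring check is hypothetical and not something this commit does:

// Sketch: using the SoC name cached on the context (illustration only).
ggml_backend_cann_context ctx(/*device=*/0);  // constructor calls aclrtGetSocName()

GGML_LOG_INFO("CANN device %d SoC: %s\n", ctx.device, ctx.description.c_str());

// Hypothetical per-SoC specialization keyed on the cached string:
if (ctx.description.find("Ascend910B") != std::string::npos) {
    // e.g. select a kernel variant tuned for that SoC family
}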
ggml/src/ggml-cann/ggml-cann.cpp
CHANGED

@@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() {
         ACL_CHECK(aclrtMemGetAllocationGranularity(
             &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
             &info.devices[id].vmm_granularity));
+
+        size_t free, total;
+        ggml_backend_cann_get_device_memory(id, &free, &total);
+        info.devices[id].total_vram = free;
     }
 
     // TODO: add more device info later.
@@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
      * @return A pointer to the allocated buffer.
      */
     void* alloc(size_t size, size_t* actual_size) override {
+        const size_t alignment = 128;
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }
 #ifdef DEBUG_CANN_MALLOC
         int nnz = 0;
         size_t max_size = 0;
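All three allocation paths touched by this commit normalize requests the same way: GGML_PAD(size, 128) rounds the size up to a 128-byte multiple (ggml's integer round-up macro), and the size == 0 guard keeps a zero-byte request from reaching aclrtMalloc/aclrtMallocHost. A quick self-contained check of that arithmetic:

// Round-up behaviour that the new alignment code relies on.
#include <cassert>
#include <cstddef>

#define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))  // integer round-up, as in ggml.h

int main() {
    const size_t alignment = 128;
    assert(GGML_PAD((size_t) 1,   alignment) == 128);  // small requests round up
    assert(GGML_PAD((size_t) 128, alignment) == 128);  // exact multiples unchanged
    assert(GGML_PAD((size_t) 129, alignment) == 256);
    size_t size = GGML_PAD((size_t) 0, alignment);     // 0 stays 0 ...
    if (size == 0) { size = alignment; }               // ... hence the guard
    assert(size == 128);
    return 0;
}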
@@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
             return ptr;
         }
         void* ptr;
-        size_t look_ahead_size = (size_t)(1.05 * size);
-        look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
         ggml_cann_set_device(device);
         ACL_CHECK(
-            aclrtMalloc(&ptr,
+            aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
-        *actual_size =
+        *actual_size = size;
-        pool_size +=
+        pool_size += size;
 #ifdef DEBUG_CANN_MALLOC
         GGML_LOG_INFO(
             "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
@@ -296,7 +303,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
     /**
      * @brief The maximum size of the virtual memory pool (32 GB).
      */
+    size_t max_size;
 
     /**
      * @brief The device ID associated with this buffer pool.

@@ -341,7 +348,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
      */
     explicit ggml_cann_pool_vmm(int device)
         : device(device),
-          granularity(ggml_cann_info().devices[device].vmm_granularity) {
+          granularity(ggml_cann_info().devices[device].vmm_granularity) {
+        auto dev = ggml_cann_info().devices[device];
+        granularity = dev.vmm_granularity;
+        max_size = dev.total_vram;
+    }
 
     /**
      * @brief Destructor to free all buffers in the virtual memory pool.

@@ -370,17 +381,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
         // round up the allocation size to the alignment to ensure that all
         // allocations are aligned for all data types
         const size_t alignment = 128;
-        size =
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }
 
         size_t avail = pool_size - pool_used;
 
         if (size > avail) {
             // round up to the next multiple of the granularity
             size_t reserve_size = size - avail;
-            reserve_size =
-                granularity * ((reserve_size + granularity - 1) / granularity);
+            reserve_size = GGML_PAD(reserve_size, granularity);
 
-            GGML_ASSERT(pool_size + reserve_size <=
+            GGML_ASSERT(pool_size + reserve_size <= max_size);
 
             // allocate more physical memory
             aclrtPhysicalMemProp prop = {};

@@ -396,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
             // reserve virtual address space (if not already reserved)
             if (pool_addr == 0) {
                 ACL_CHECK(aclrtReserveMemAddress(
-                    &pool_addr,
+                    &pool_addr, max_size, 0, NULL, 1));
             }
 
             // map at the end of the pool

@@ -409,10 +422,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
             // add to the pool
             pool_size += reserve_size;
 
+#ifdef DEBUG_CANN_MALLOC
+            GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+                          device, (unsigned long long) (pool_size / 1024 / 1024),
+                          (unsigned long long) (reserve_size / 1024 / 1024));
+#endif
         }
 
         GGML_ASSERT(pool_addr != 0);

@@ -457,7 +471,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
  */
 std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
     int device) {
-    // return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
     return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
 }

@@ -1130,10 +1143,10 @@ ggml_backend_cann_buffer_type(int32_t device) {
     static bool ggml_backend_cann_buffer_type_initialized = false;
 
     if (!ggml_backend_cann_buffer_type_initialized) {
-        for (int32_t i = 0; i <
+        for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
             ggml_backend_cann_buffer_types[i] = {
                 /* .iface = */ ggml_backend_cann_buffer_type_interface,
-                /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(),
+                /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
                 /* .context = */
                 new ggml_backend_cann_buffer_type_context{
                     i, "CANN" + std::to_string(i)},

@@ -1199,10 +1212,15 @@ static void * ggml_cann_host_malloc(size_t size) {
         return nullptr;
     }
 
+    const size_t alignment = 128;
+    size = GGML_PAD(size, alignment);
+    if (size == 0) {
+        size = alignment;
+    }
+
     void * hostPtr = nullptr;
     aclError err = aclrtMallocHost((void **) &hostPtr, size);
     if (err != ACL_SUCCESS) {
-
         GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                       size / 1024.0 / 1024.0, aclGetRecentErrMsg());
         return nullptr;
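Taken together, the VMM pool changes replace the fixed 32 GB ceiling with max_size captured from free VRAM at init, reserve the whole virtual range once, and commit physical memory in granularity-sized steps as the pool grows. A self-contained sketch of the growth bookkeeping only (the aclrtMallocPhysical/aclrtMapMem calls are elided; the 2 MiB granularity and 8 GiB cap are example values):

// Illustration of ggml_cann_pool_vmm growth: physical memory is committed
// in granularity-sized steps, and total commitment must stay under max_size.
#include <cassert>
#include <cstddef>

#define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))

int main() {
    const size_t granularity = 2u * 1024 * 1024;            // example: 2 MiB
    const size_t max_size    = 8ull * 1024 * 1024 * 1024;   // free VRAM at init
    size_t pool_size = 0, pool_used = 0;

    size_t size  = 3u * 1024 * 1024;                        // incoming 3 MiB request
    size_t avail = pool_size - pool_used;
    if (size > avail) {
        size_t reserve_size = GGML_PAD(size - avail, granularity);  // -> 4 MiB
        assert(pool_size + reserve_size <= max_size);               // same guard as the diff
        pool_size += reserve_size;       // physical alloc + map would happen here
    }
    assert(pool_size == 4u * 1024 * 1024);
    return 0;
}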