Commit b357ea7
Parent: 6a4b6ae

CANN: RoPE and CANCAT operator optimization (llama/10488)

Files changed:
  ggml/src/ggml-cann/aclnn_ops.cpp   +47 -178
  ggml/src/ggml-cann/ggml-cann.cpp   +59 -9
ggml/src/ggml-cann/aclnn_ops.cpp
CHANGED

@@ -21,6 +21,7 @@
  */
 
 #include "aclnn_ops.h"
+#include "ggml-impl.h"
 
 #include <aclnnop/aclnn_avgpool2d.h>
 #include <aclnnop/aclnn_cast.h>
@@ -241,10 +242,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
 
-
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+    int32_t acl_dim = 3 - dim;
+
     aclTensor* tensors[] = {acl_src0, acl_src1};
     aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
-    aclnn_concat(ctx, tensorList, acl_dst,
+    aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
 
     ACL_CHECK(aclDestroyTensorList(tensorList));
     ACL_CHECK(aclDestroyTensor(acl_dst));
@@ -1437,10 +1442,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // kernel
     ggml_tensor* src1 = dst->src[1];  // input
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
     GGML_TENSOR_BINARY_OP_LOCALS;
 
     // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
@@ -1462,9 +1463,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;
 
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
     // memory allocated increased to 3x when is_2D == false
    const int64_t n_bytes_factor = is_2D ? 1 : 3;
 
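Note on the CONCAT hunk above: the concatenation axis is now read from the op's parameters (ggml_get_op_params_i32) instead of being hard-coded. ggml numbers its dimensions with ne[0] as the innermost axis, while the ACL tensor created for the same data describes its axes in the opposite order, so the ggml dim is mirrored with 3 - dim. A minimal sketch of that mapping (the helper name is invented for illustration):

    #include <cassert>
    #include <cstdint>

    // Mirror a ggml dim index (ne[0] = innermost of 4 dims) onto the reversed
    // axis order used by the ACL tensor; same bounds guard as in the diff.
    static int32_t ggml_dim_to_acl_dim(int32_t dim) {
        assert(dim >= 0 && dim < 4);
        return 3 - dim;
    }
    // Example: a concat along ggml dim 2 runs along ACL dim 1.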
@@ -2859,15 +2857,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
 }
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+    const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
+    int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
+    aclOpExecutor** executor);
+aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
+                                         uint64_t workspaceSize,
+                                         aclOpExecutor* executor,
+                                         aclrtStream stream);
+#ifdef __cplusplus
+}
+#endif
+
 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // TODO: use ascendc
     // Only test with LLAMA model.
     ggml_tensor* src0 = dst->src[0];  // input
     ggml_tensor* src2 = dst->src[2];  // freq_factors
 
-    // TODO: with freq_factors
-    GGML_ASSERT(src2 == NULL);
-
     // param
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
     // const int n_past = ((int32_t *) dst->op_params)[0];
@@ -2885,13 +2895,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
 
-
+    // TODO: with freq_factors
+    GGML_ASSERT(src2 == NULL);
+    // TODO: attn_factor != 1
+    GGML_ASSERT(attn_factor == 1);
+    // TODO: n_dims <= ne0
+    GGML_ASSERT(n_dims == ne0);
     GGML_ASSERT(n_dims % 2 == 0);
-
     // TODO: ext_factor != 0
     GGML_ASSERT(ext_factor == 0);
     // TODO: freq_scale != 1
     GGML_ASSERT(freq_scale == 1);
+    // TODO: type == GGML_TYPE_F16
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
     const float theta_scale = powf(freq_base, -2.0f / n_dims);
 
@@ -2924,177 +2940,30 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
                      theta_scale, is_neox);
 
-
-
-    aclTensor* acl_minus_one_tensor;
-    void* minus_one_scale_buffer = nullptr;
-    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
-    ggml_cann_pool_alloc minus_one_scale_allocator(
-        ctx.pool(), sizeof(float_t) * src0->ne[0]);
-    if (!is_neox) {
-        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
-        input_roll_buffer = roll_allocator.get();
-        int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
-                                    src0->ne[2], src0->ne[3]};
-        size_t input_roll_nb[GGML_MAX_DIMS];
-        input_roll_nb[0] = ggml_type_size(src0->type);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
-        }
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            src0->data, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-
-        int64_t shifts[] = {1};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-
-        // init [-1, 1, -1, 1, ...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
-
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        int64_t dim = 3;
-        int64_t* index = new int64_t[src0->ne[0]];
-        for (int i = 0; i < src0->ne[0]; i++) {
-            index[i] = i / 2 * 2;
-        }
-        int64_t index_num = src0->ne[0];
-        float value = -1;
-        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
-                                index_num, value);
-    } else {
-        // roll input: [q0,q1,q2,...] ->
-        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
-        input_roll_buffer = roll_allocator.get();
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
-
-        int64_t shifts[] = {src0->ne[0] / 2};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
 
-
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
+    void* workspaceAddr = nullptr;
 
-
-
-
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        // -1 * first half
-        int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
-        size_t first_half_nb[GGML_MAX_DIMS];
-        first_half_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
-        }
-        aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
-            minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
-            first_half_nb, GGML_MAX_DIMS);
-        bool inplace = true;
-        float scale = -1;
-        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
-        ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
-    }
-
-    // TODO: n_dims < ne0
-    GGML_ASSERT(n_dims == src0->ne[0]);
-
-    // input * scale
-    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
-                                                  ggml_nbytes(src0));
-    void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
-    size_t input_nb[GGML_MAX_DIMS];
-    input_nb[0] = ggml_type_size(src0->type);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
+    int acl_mode = mode;
+    if (mode == 0) {
+        acl_mode = 1;
     }
-    aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
-        input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
-    aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
-        input_roll_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
 
-
-              acl_input_roll_mul_scale_tensor);
-
-    // output
-    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+    aclTensor* acl_x = ggml_cann_create_tensor(src0);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-
-
-
-
-
-        aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst);
-        // TODO: ne0 != n_dims in mode2
-    } else if (src0->type == GGML_TYPE_F16) {
-        size_t input_fp32_nb[GGML_MAX_DIMS];
-        input_fp32_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
-        }
-        ggml_cann_pool_alloc fp32_allocator1(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer1 = fp32_allocator1.get();
-        aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
-            input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        ggml_cann_pool_alloc fp32_allocator2(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer2 = fp32_allocator2.get();
-        aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
-            input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-
-        ggml_cann_pool_alloc fp32_allocator(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        output_fp32_buffer = fp32_allocator.get();
-        aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
-            output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1);
-        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
-                  input_fp32_tensor2);
-        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
-                  output_fp32_tensor);
-        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
-
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
-        ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
+    ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
     }
 
-    ACL_CHECK(
+    ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
+                                           executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_x));
     ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
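The RoPE hunks replace the hand-rolled roll/scale/multiply sequence with a single call to the aclnnRotaryPositionEmbedding operator (ggml's mode 0 is remapped to the operator's mode 1), using the standard two-phase aclnn convention: query the required workspace size, allocate it from the backend memory pool, then launch on the context's stream. A condensed sketch of that pattern, restricted to calls that appear in the diff (the wrapper name is invented; tensor construction and the parameter asserts are omitted):

    // Two-phase aclnn invocation, as used by the new ggml_cann_rope.
    static void rope_apply_sketch(ggml_backend_cann_context& ctx,
                                  aclTensor* acl_x, aclTensor* acl_cos,
                                  aclTensor* acl_sin, aclTensor* acl_dst,
                                  int64_t acl_mode) {
        uint64_t workspaceSize = 0;
        aclOpExecutor* executor = nullptr;
        void* workspaceAddr = nullptr;

        // Phase 1: ask the operator how much device scratch memory it needs.
        ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
            acl_x, acl_cos, acl_sin, acl_mode, acl_dst, &workspaceSize, &executor));

        // Phase 2: allocate the workspace from the pool, then launch on the
        // backend stream; the call itself is asynchronous.
        if (workspaceSize > 0) {
            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
            workspaceAddr = workspace_allocator.get();
        }
        ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
                                               executor, ctx.stream()));
    }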
ggml/src/ggml-cann/ggml-cann.cpp
CHANGED

@@ -1669,12 +1669,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         }
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
-                case GGML_TYPE_F16:
-                case GGML_TYPE_F32:
                 case GGML_TYPE_Q8_0:
-                    // TODO: fix me
                     // Current groupsize should not be greater than k-1 in
-                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
+                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
+                    if (op->src[0]->ne[0] <= QK8_0) {
+                        return false;
+                    }
+                case GGML_TYPE_F16:
+                case GGML_TYPE_F32:
                 case GGML_TYPE_Q4_0:
                     return true;
                 default:
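Note on the MUL_MAT hunk above: Q8_0 matrices are now admitted only when the weight row length op->src[0]->ne[0] (k) is larger than QK8_0, the Q8_0 block size, because, per the comment, the group size passed to aclnnWeightQuantBatchMatmulV2GetWorkspaceSize must not exceed k-1; when the guard passes, the case falls through to the other supported types and returns true. Stated as a standalone check (helper name invented for illustration):

    // Admission check the commit adds for Q8_0 mat-mul on the CANN backend.
    static bool cann_q8_0_mul_mat_ok(const ggml_tensor* weights) {
        // ne[0] is k, the reduction dimension; QK8_0 is the Q8_0 block size.
        return weights->ne[0] > QK8_0;
    }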
@@ -1706,9 +1708,61 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                     return false;
             }
         }
+        case GGML_OP_CONT: {
+            // TODO: support GGML_TYPE_BF16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_ROPE: {
+            // TODO: with ops-test v == 1
+            float * freq_scale = (float*)((int32_t*)op->op_params + 6);
+            float * ext_factor = (float*)((int32_t*)op->op_params + 7);
+            float * attn_factor = (float*)((int32_t*)op->op_params + 8);
+            // TODO: with freq_factors
+            if (op->src[2] != NULL) {
+                return false;
+            }
+            // TODO: n_dims <= ne0
+            if (op->src[0]->ne[0] != op->op_params[1]) {
+                return false;
+            }
+            // TODO: ext_factor != 0
+            if (*ext_factor != 0) {
+                return false;
+            }
+            // TODO: freq_scale != 1
+            if (*freq_scale != 1) {
+                return false;
+            }
+            // TODO: attn_factor != 1
+            if (*attn_factor != 1) {
+                return false;
+            }
+            //TODO: type == GGML_TYPE_F16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_UPSCALE: {
+            // aclnnUpsampleNearest2dGetWorkspaceSize not support
+            // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
+            if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
+                return false;
+            }
+            return true;
+        }
+        case GGML_OP_IM2COL:
+        case GGML_OP_CONCAT:
         case GGML_OP_DUP:
         case GGML_OP_REPEAT:
-        case GGML_OP_CONCAT:
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -1722,17 +1776,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_CLAMP:
-        case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_ROPE:
-        case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
-        case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
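In the GGML_OP_ROPE guard above, the float hyper-parameters are read straight out of op->op_params, which ggml stores as an int32 array: freq_scale, ext_factor and attn_factor sit at indices 6, 7 and 8, and n_dims at index 1, matching the offsets ggml_cann_rope reads with memcpy. A self-contained sketch of the same admission test (the helper name is invented, and memcpy is used here in place of the pointer casts; it reads the same bytes):

    #include <cstring>
    #include "ggml.h"

    // Conditions under which the CANN backend now reports RoPE as supported.
    static bool cann_rope_supported_sketch(const ggml_tensor* op) {
        float freq_scale, ext_factor, attn_factor;
        memcpy(&freq_scale,  (const int32_t*)op->op_params + 6, sizeof(float));
        memcpy(&ext_factor,  (const int32_t*)op->op_params + 7, sizeof(float));
        memcpy(&attn_factor, (const int32_t*)op->op_params + 8, sizeof(float));

        if (op->src[2] != NULL)                    return false;  // freq_factors not handled
        if (op->src[0]->ne[0] != op->op_params[1]) return false;  // needs n_dims == ne0
        if (ext_factor != 0.0f)                    return false;  // ext_factor must be 0
        if (freq_scale != 1.0f)                    return false;  // freq_scale must be 1
        if (attn_factor != 1.0f)                   return false;  // attn_factor must be 1
        return op->src[0]->type == GGML_TYPE_F32;                 // F16 input not supported yet
    }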