Chenguang Li (noemotiovon) committed
Commit b357ea7 · Parent: 6a4b6ae

CANN: RoPE and CONCAT operator optimization (llama/10488)

ggml/src/ggml-cann/aclnn_ops.cpp CHANGED
@@ -21,6 +21,7 @@
  */
 
 #include "aclnn_ops.h"
+#include "ggml-impl.h"
 
 #include <aclnnop/aclnn_avgpool2d.h>
 #include <aclnnop/aclnn_cast.h>
@@ -241,10 +242,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
 
-    int64_t concat_dim = 1;
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+    int32_t acl_dim = 3 - dim;
+
     aclTensor* tensors[] = {acl_src0, acl_src1};
     aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
-    aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
+    aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
 
     ACL_CHECK(aclDestroyTensorList(tensorList));
     ACL_CHECK(aclDestroyTensor(acl_dst));
@@ -1437,10 +1442,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // kernel
     ggml_tensor* src1 = dst->src[1];  // input
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
     GGML_TENSOR_BINARY_OP_LOCALS;
 
     // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
@@ -1462,9 +1463,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;
 
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
     // memory allocated increased to 3x when is_2D == false
     const int64_t n_bytes_factor = is_2D ? 1 : 3;
 
@@ -2859,15 +2857,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
 }
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+    const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
+    int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
+    aclOpExecutor** executor);
+aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
+                                         uint64_t workspaceSize,
+                                         aclOpExecutor* executor,
+                                         aclrtStream stream);
+#ifdef __cplusplus
+}
+#endif
+
 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // TODO: use ascendc
     // Only test with LLAMA model.
     ggml_tensor* src0 = dst->src[0];  // input
     ggml_tensor* src2 = dst->src[2];  // freq_factors
 
-    // TODO: with freq_factors
-    GGML_ASSERT(src2 == NULL);
-
     // param
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
     // const int n_past = ((int32_t *) dst->op_params)[0];
@@ -2885,13 +2895,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
 
-    GGML_ASSERT(n_dims <= ne0);
+    // TODO: with freq_factors
+    GGML_ASSERT(src2 == NULL);
+    // TODO: attn_factor != 1
+    GGML_ASSERT(attn_factor == 1);
+    // TODO: n_dims <= ne0
+    GGML_ASSERT(n_dims == ne0);
     GGML_ASSERT(n_dims % 2 == 0);
-
     // TODO: ext_factor != 0
     GGML_ASSERT(ext_factor == 0);
     // TODO: freq_scale != 1
     GGML_ASSERT(freq_scale == 1);
+    // TODO: type == GGML_TYPE_F16
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
     const float theta_scale = powf(freq_base, -2.0f / n_dims);
 
@@ -2924,177 +2940,30 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
                      theta_scale, is_neox);
 
-    // roll input
-    void* input_roll_buffer;
-    aclTensor* acl_minus_one_tensor;
-    void* minus_one_scale_buffer = nullptr;
-    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
-    ggml_cann_pool_alloc minus_one_scale_allocator(
-        ctx.pool(), sizeof(float_t) * src0->ne[0]);
-    if (!is_neox) {
-        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
-        input_roll_buffer = roll_allocator.get();
-        int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
-                                    src0->ne[2], src0->ne[3]};
-        size_t input_roll_nb[GGML_MAX_DIMS];
-        input_roll_nb[0] = ggml_type_size(src0->type);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
-        }
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            src0->data, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-
-        int64_t shifts[] = {1};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-
-        // init [-1, 1, -1, 1, ...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
-
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        int64_t dim = 3;
-        int64_t* index = new int64_t[src0->ne[0]];
-        for (int i = 0; i < src0->ne[0]; i++) {
-            index[i] = i / 2 * 2;
-        }
-        int64_t index_num = src0->ne[0];
-        float value = -1;
-        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
-                                index_num, value);
-    } else {
-        // roll input: [q0,q1,q2,...] ->
-        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
-        input_roll_buffer = roll_allocator.get();
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
-
-        int64_t shifts[] = {src0->ne[0] / 2};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
 
-        // init [-1, -1, -1, 1, 1,1,...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
+    void* workspaceAddr = nullptr;
 
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        // -1 * first half
-        int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
-        size_t first_half_nb[GGML_MAX_DIMS];
-        first_half_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
-        }
-        aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
-            minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
-            first_half_nb, GGML_MAX_DIMS);
-        bool inplace = true;
-        float scale = -1;
-        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
-        ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
-    }
-
-    // TODO: n_dims < ne0
-    GGML_ASSERT(n_dims == src0->ne[0]);
-
-    // input * scale
-    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
-                                                  ggml_nbytes(src0));
-    void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
-    size_t input_nb[GGML_MAX_DIMS];
-    input_nb[0] = ggml_type_size(src0->type);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
+    int acl_mode = mode;
+    if (mode == 0) {
+        acl_mode = 1;
     }
-    aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
-        input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
-    aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
-        input_roll_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
 
-    aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
-              acl_input_roll_mul_scale_tensor);
-
-    // output
-    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+    aclTensor* acl_x = ggml_cann_create_tensor(src0);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-    void* output_fp32_buffer;
-    if (src0->type == GGML_TYPE_F32) {
-        aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor);
-        aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
-                          acl_sin_reshape_tensor);
-        aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst);
-        // TODO: ne0 != n_dims in mode2
-    } else if (src0->type == GGML_TYPE_F16) {
-        size_t input_fp32_nb[GGML_MAX_DIMS];
-        input_fp32_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
-        }
-        ggml_cann_pool_alloc fp32_allocator1(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer1 = fp32_allocator1.get();
-        aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
-            input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        ggml_cann_pool_alloc fp32_allocator2(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer2 = fp32_allocator2.get();
-        aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
-            input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-
-        ggml_cann_pool_alloc fp32_allocator(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        output_fp32_buffer = fp32_allocator.get();
-        aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
-            output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1);
-        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
-                  input_fp32_tensor2);
-        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
-                  output_fp32_tensor);
-        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
-
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
-        ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
+    ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
     }
 
-    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+    ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
+                                           executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_x));
     ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
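
Notes on the aclnn_ops.cpp changes: ggml_cann_concat previously hard-coded the concatenation axis to 1; it now reads the axis from the op's parameters and range-checks it. The 3 - dim remap is needed because ggml numbers dimensions from the contiguous axis up (ne[0] is innermost), while the ACL tensor descriptors built by ggml_cann_create_tensor index them in the opposite order. The hand-written RoPE path (roll, alternating-sign scale, multiply-add against the sin/cos caches) is replaced by CANN's fused aclnnRotaryPositionEmbedding operator, invoked through the usual two-phase aclnn calling convention. A minimal sketch of that convention, as a standalone helper; only the helper name is invented, every call below appears in the diff above:

    // Phase 1 plans the operator and reports its scratch requirement;
    // phase 2 allocates the workspace from the backend pool and enqueues
    // the kernel on the context stream.
    static void rope_two_phase(ggml_backend_cann_context& ctx, aclTensor* x,
                               aclTensor* cos, aclTensor* sin, int64_t mode,
                               aclTensor* y) {
        uint64_t workspaceSize = 0;
        aclOpExecutor* executor;
        void* workspaceAddr = nullptr;

        // Phase 1: query the workspace size and obtain an executor handle.
        ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
            x, cos, sin, mode, y, &workspaceSize, &executor));

        // Phase 2: allocate scratch memory (if any) and launch, using the
        // same pool-allocation pattern as the diff above.
        if (workspaceSize > 0) {
            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
            workspaceAddr = workspace_allocator.get();
        }
        ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
                                               executor, ctx.stream()));
    }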
ggml/src/ggml-cann/ggml-cann.cpp CHANGED
@@ -1669,12 +1669,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         }
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
-                case GGML_TYPE_F16:
-                case GGML_TYPE_F32:
                 case GGML_TYPE_Q8_0:
-                    // TODO: fix me
                     // Current groupsize should not be greater than k-1 in
-                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
+                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
+                    if (op->src[0]->ne[0] <= QK8_0) {
+                        return false;
+                    }
+                case GGML_TYPE_F16:
+                case GGML_TYPE_F32:
                 case GGML_TYPE_Q4_0:
                     return true;
                 default:
@@ -1706,9 +1708,61 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                     return false;
                 }
             }
+        case GGML_OP_CONT: {
+            // TODO: support GGML_TYPE_BF16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_ROPE: {
+            // TODO: with ops-test v == 1
+            float * freq_scale = (float*)((int32_t*)op->op_params + 6);
+            float * ext_factor = (float*)((int32_t*)op->op_params + 7);
+            float * attn_factor = (float*)((int32_t*)op->op_params + 8);
+            // TODO: with freq_factors
+            if (op->src[2] != NULL) {
+                return false;
+            }
+            // TODO: n_dims <= ne0
+            if (op->src[0]->ne[0] != op->op_params[1]) {
+                return false;
+            }
+            // TODO: ext_factor != 0
+            if (*ext_factor != 0) {
+                return false;
+            }
+            // TODO: freq_scale != 1
+            if (*freq_scale != 1) {
+                return false;
+            }
+            // TODO: attn_factor != 1
+            if (*attn_factor != 1) {
+                return false;
+            }
+            // TODO: type == GGML_TYPE_F16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_UPSCALE: {
+            // aclnnUpsampleNearest2dGetWorkspaceSize not support
+            // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
+            if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
+                return false;
+            }
+            return true;
+        }
+        case GGML_OP_IM2COL:
+        case GGML_OP_CONCAT:
         case GGML_OP_DUP:
         case GGML_OP_REPEAT:
-        case GGML_OP_CONCAT:
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -1722,17 +1776,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_CLAMP:
-        case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_ROPE:
-        case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
-        case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
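
Notes on the ggml-cann.cpp changes: GGML_OP_CONT, GGML_OP_ROPE, GGML_OP_IM2COL, GGML_OP_UPSCALE, and GGML_OP_CONCAT move out of the unconditional case list into explicit supports_op checks that mirror the asserts in the kernels, so unsupported shapes and parameters are rejected when the graph is scheduled instead of aborting inside the kernel. In GGML_OP_MUL_MAT, the GGML_TYPE_Q8_0 case now deliberately falls through to the shared return true once the ne[0] <= QK8_0 group-size guard passes. The GGML_OP_UPSCALE condition cross-multiplies the ne[2] and ne[3] pairs, which requires both axes to be scaled by the same ratio without a division. The GGML_OP_ROPE check reads float parameters straight out of the op's raw op_params words; a small sketch of that layout (the helper name is hypothetical, the word indices are the ones used in the reads above):

    #include <cstring>

    // ggml stores per-op parameters as an array of 32-bit words. For ROPE,
    // n_dims lives in word 1 and IEEE-754 floats are bit-cast into words
    // 6..10 (freq_scale, ext_factor, attn_factor, beta_fast, beta_slow).
    static float rope_param_f32(const ggml_tensor * op, int idx) {
        float v;
        // memcpy is the aliasing-safe way to reinterpret a word as a float,
        // matching the memcpy reads in ggml_cann_rope.
        memcpy(&v, (const int32_t *) op->op_params + idx, sizeof(float));
        return v;
    }

    // e.g. the supports_op guards above are equivalent to requiring:
    //     rope_param_f32(op, 6) == 1   // freq_scale
    //     rope_param_f32(op, 7) == 0   // ext_factor
    //     rope_param_f32(op, 8) == 1   // attn_factor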