leo-pony committed
Commit c9e03e6 · 1 Parent(s): 69ae50d

CANN: Support Ascend310P to accelerate F32 and F16 Model (llama/10216)

* CANN Support Ascend310P to accelerate F32 and F16 Model

* Add compile option soc type macro ASCEND_310P to ggml-cann lib

* Remove unused code

* Remove the ascend soc_type hard code compile option in CMakelist.txt

ggml/src/ggml-cann/CMakeLists.txt CHANGED
@@ -3,6 +3,33 @@ if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOM
     message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
 endif()
 
+# Auto-detech Soc type and Soc version, if detect failed, will abort build
+set(SOC_VERSION "")
+function(detect_ascend_soc_type SOC_VERSION)
+    execute_process(
+        COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'"
+        OUTPUT_VARIABLE npu_info
+        RESULT_VARIABLE npu_result
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if("${npu_info}" STREQUAL "" OR ${npu_result})
+        message(FATAL_ERROR "Auto-detech ascend soc type failed, please specify manually or check ascend device working normally.")
+    endif()
+    set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE)
+endfunction()
+
+if(NOT SOC_TYPE)
+    detect_ascend_soc_type(SOC_VERSION)
+    set(SOC_TYPE "${SOC_VERSION}")
+    message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
+else()
+    string(TOLOWER ${SOC_TYPE} SOC_VERSION)
+endif()
+
+# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P.
+string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
+set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
+
 if (CANN_INSTALL_DIR)
     # Only Support Linux.
     if (NOT UNIX)
@@ -39,6 +66,8 @@ if (CANN_INSTALL_DIR)
     target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
     target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
 
+    target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
+
     message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
     message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
 else()
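
Note: the net effect of this build-system change is that exactly one SoC macro (for example ASCEND_310P or ASCEND_910B) is defined for both the ggml-cann library and the AscendC kernels, derived either from npu-smi auto-detection or from an explicit -DSOC_TYPE=... at configure time. A minimal, hypothetical sketch of how such a macro is consumed at compile time (the function below is illustrative only and not part of the commit):

// Hypothetical sketch: how a SoC macro set by CMake (-DASCEND_310P)
// selects a device-specific code path at compile time.
#include <cstdio>

void describe_soc_path() {
#ifdef ASCEND_310P
    // 310P lacks some pad/quant features, so kernels take the
    // aligned-copy + atomic-add fallback introduced in this commit.
    std::printf("compiled for Ascend310P\n");
#else
    // Default path (e.g. Ascend910B) keeps using DataCopyPad.
    std::printf("compiled for the default SoC target\n");
#endif
}
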
ggml/src/ggml-cann/aclnn_ops.cpp CHANGED
@@ -2312,6 +2312,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     switch (src0->type) {
         case GGML_TYPE_F32:
+        {
+#ifdef ASCEND_310P
+            // Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            if ((src0->ne[0] % 8) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
             aclrtlaunch_ascendc_get_row_f32(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
                 ((ggml_tensor*)src0->extra)->ne,
@@ -2320,7 +2328,16 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                 ((ggml_tensor*)dst->extra)->nb);
             break;
+        }
         case GGML_TYPE_F16:
+        {
+#ifdef ASCEND_310P
+            // Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
+            if ((src0->ne[0] % 16) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
             aclrtlaunch_ascendc_get_row_f16(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
                 ((ggml_tensor*)src0->extra)->ne,
@@ -2329,6 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                 ((ggml_tensor*)dst->extra)->nb);
             break;
+        }
         case GGML_TYPE_Q4_0:
             aclrtlaunch_ascendc_get_row_q4_0(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
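
Note: the host-side memset is a companion to the kernel-side tail handling added further down. On 310P the get_row kernels flush the unaligned tail of each row as a full 32-byte block using atomic add, so the destination buffer must start out zeroed. A rough sketch of the size arithmetic, using an invented shape rather than real tensors (the standalone program below is illustrative, not from the commit):

// Hypothetical sketch of the dst_len computation from the F32 branch:
// number of gathered rows (src1 dims) times row length (src0->ne[0]) times sizeof(float).
#include <cstddef>
#include <cstdio>

int main() {
    long src0_ne0 = 70;                              // row length, not a multiple of 8 floats (32 bytes)
    long src1_ne0 = 4, src1_ne1 = 2, src1_ne2 = 1;   // how many rows get gathered

    size_t dst_len = (size_t)src1_ne0 * src1_ne1 * src1_ne2 * src0_ne0 * sizeof(float);
    if (src0_ne0 % 8 != 0) {
        // On 310P the whole destination would be zeroed before launch,
        // i.e. aclrtMemset(dst, dst_len, 0, dst_len) in the commit.
        std::printf("pre-zero %zu bytes before launching get_row_f32\n", dst_len);
    }
    return 0;
}
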
ggml/src/ggml-cann/kernels/CMakeLists.txt CHANGED
@@ -1,7 +1,3 @@
-if (NOT SOC_TYPE)
-    set (SOC_TYPE "Ascend910B3")
-endif()
-
 file(GLOB SRC_FILES
     get_row_f32.cpp
     get_row_f16.cpp
@@ -13,7 +9,6 @@ file(GLOB SRC_FILES
     dup.cpp
 )
 
-string(TOLOWER ${SOC_TYPE} SOC_VERSION)
 set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
 set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
 
@@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
 )
 
+message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.")
+ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
 # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
ggml/src/ggml-cann/kernels/dup.cpp CHANGED
@@ -5,6 +5,7 @@
 using namespace AscendC;
 
 #define BUFFER_NUM 2
+const int64_t SUPPORTED_MAX_DIM = 65535;  // currently the limit of max block dim supportted by dup kernel is 65535
 
 template <typename SRC_T, typename DST_T>
 class DupByRows {
@@ -19,6 +20,7 @@ class DupByRows {
         // Input has four dims.
         int64_t op_block_num = GetBlockNum();
         int64_t op_block_idx = GetBlockIdx();
+        assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);
 
         // param
         num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
@@ -51,24 +53,36 @@ class DupByRows {
 
     __aicore__ inline void copy_in() {
         LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
-
-        DataCopyExtParams dataCopyParams;
-        dataCopyParams.blockCount = 1;
-        dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
-        DataCopyPadExtParams<SRC_T> padParams;
-        DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
-
+        const size_t elem_per_block = 32 / sizeof(SRC_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
+        DataCopy(src_local, src_gm, cpy_elements_len);
         src_queue.EnQue(src_local);
     }
 
     __aicore__ inline void copy_out() {
         LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
-
+#ifdef ASCEND_310P
+        const size_t elem_per_block = 32 / sizeof(DST_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t len = num_elem & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(dst_gm, dst_local, len);
+        }
+        if(tail != 0) {
+            for (size_t i = tail; i < elem_per_block; i++) {
+                dst_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(dst_gm[len], dst_local[len], elem_per_block);
+            SetAtomicNone();
+        }
+#else
         DataCopyExtParams dataCopyParams;
         dataCopyParams.blockCount = 1;
         dataCopyParams.blockLen = num_elem * sizeof(DST_T);
         DataCopyPad(dst_gm, dst_local, dataCopyParams);
-
+#endif
        dst_queue.FreeTensor(dst_local);
     }
 
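
Note: all of the copy helpers above work in 32-byte blocks, the granularity that DataCopy moves. A standalone host-side sketch of the same masking arithmetic, with invented values, may make the aligned/tail split clearer (plain C++, not AscendC, and not part of the commit):

// Hypothetical illustration of the 32-byte block math used in copy_in/copy_out.
#include <cstddef>
#include <cstdio>

int main() {
    using DST_T = float;
    const size_t elem_per_block = 32 / sizeof(DST_T);  // 8 floats per 32-byte block
    size_t num_elem = 70;                              // an unaligned row length

    size_t tail = num_elem % elem_per_block;           // 70 % 8 = 6 leftover elements
    size_t len  = num_elem & ~(elem_per_block - 1);    // 70 & ~7 = 64, the aligned part

    // copy_out on 310P: copy the aligned 64 elements directly, then emit the last
    // 6 elements (plus 2 zeroed pad slots) as one full block under atomic add.
    std::printf("aligned=%zu tail=%zu pad=%zu\n", len, tail, elem_per_block - tail);
    return 0;
}
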
 
ggml/src/ggml-cann/kernels/get_row_f16.cpp CHANGED
@@ -14,7 +14,7 @@ class GET_ROW_F16 {
                                  int64_t *output_ne_ub, size_t *output_nb_ub) {
         // TODO, use template for F16/f32
         int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();
 
         for (int i = 0; i < 4; i++) {
             input_ne[i] = input_ne_ub[i];
@@ -59,32 +59,42 @@ class GET_ROW_F16 {
     }
 
     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+        size_t origin_len = len;
         LocalTensor<half> input_local = input_queue.AllocTensor<half>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(half);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
         if(tail != 0) {
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(half);
-            DataCopyPadExtParams<half> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len],
-                        dataCopyParams, padParams);
+            len += elem_per_block;
         }
+        DataCopy(input_local, input_gm[offset], len);
         input_queue.EnQue(input_local);
     }
 
     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
         if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = tail * sizeof(float);
            DataCopyPad(output_gm[offset + len], output_local[len],
                        dataCopyParams);
+#endif
         }
         output_queue.FreeTensor(output_local);
     }
@@ -150,6 +160,7 @@ class GET_ROW_F16 {
     GlobalTensor<float> output_gm;
     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };
 
 template <typename T>
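
Note: the SetAtomicAdd path relies on two invariants: the destination was pre-zeroed on the host (the aclrtMemset added in aclnn_ops.cpp) and the padding slots of the local block are cleared before the copy, so adding a full 32-byte block writes the real tail values and leaves everything past them unchanged. A tiny host-side simulation of that idea (plain C++, not AscendC; values invented, not from the commit):

// Hypothetical simulation of the 310P tail write: the destination is pre-zeroed,
// the local block's padding slots are cleared, and the full block is *added*
// rather than stored, so the padding writes cannot corrupt data past the tail.
#include <cstdio>

int main() {
    const int elem_per_block = 8;              // 32 bytes / sizeof(float)
    float dst[16] = {};                        // pre-zeroed destination
    float local[8] = {1, 2, 3, 4, 5, 6, 0, 0}; // 6 tail values + 2 zeroed pad slots

    int len = 8;                               // aligned prefix was copied separately
    for (int i = 0; i < elem_per_block; ++i) {
        dst[len + i] += local[i];              // stands in for SetAtomicAdd + DataCopy of one block
    }
    std::printf("tail lands in dst[8..13], dst[14] stays %g\n", dst[14]);
    return 0;
}
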
ggml/src/ggml-cann/kernels/get_row_f32.cpp CHANGED
@@ -13,7 +13,7 @@ class GET_ROW_F32 {
                                  int64_t *indices_ne_ub, size_t *indices_nb_ub,
                                  int64_t *output_ne_ub, size_t *output_nb_ub) {
         int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();
 
         for (int i = 0; i < 4; i++) {
             input_ne[i] = input_ne_ub[i];
@@ -55,31 +55,40 @@ class GET_ROW_F32 {
 
     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
         LocalTensor<float> input_local = input_queue.AllocTensor<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
         if(tail != 0) {
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(float);
-            DataCopyPadExtParams<float> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len],
-                        dataCopyParams, padParams);
+            len += elem_per_block;
         }
+        DataCopy(input_local, input_gm[offset], len);
         input_queue.EnQue(input_local);
     }
 
     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
         if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(float);
             DataCopyPad(output_gm[offset + len], output_local[len],
                         dataCopyParams);
+#endif
         }
         output_queue.FreeTensor(output_local);
     }
@@ -144,6 +153,7 @@ class GET_ROW_F32 {
     GlobalTensor<float> output_gm;
     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };
 
 template <typename T>
ggml/src/ggml-cann/kernels/get_row_q4_0.cpp CHANGED
@@ -110,9 +110,12 @@ class GET_ROW_Q4_0 {
         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
 
         // TODO: cast more data to speed up.
+#ifdef ASCEND_310P
+        // TODO: 310P support quantification
+#else
         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-
+#endif
         // Only mul need compile by group.
         half scale = scale_gm.GetValue(scale_offset);
 
121