leo-pony committed · Commit c9e03e6 · Parent: 69ae50d

CANN: Support Ascend310P to accelerate F32 and F16 Model (llama/10216)

* CANN: support Ascend310P to accelerate F32 and F16 models
* Add the SoC-type compile-option macro ASCEND_310P to the ggml-cann lib
* Remove unused code
* Remove the hard-coded Ascend soc_type compile option from CMakeLists.txt
- ggml/src/ggml-cann/CMakeLists.txt +29 -0
- ggml/src/ggml-cann/aclnn_ops.cpp +18 -0
- ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- ggml/src/ggml-cann/kernels/dup.cpp +23 -9
- ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +4 -1
ggml/src/ggml-cann/CMakeLists.txt
CHANGED
@@ -3,6 +3,33 @@ if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
     message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
 endif()
 
+# Auto-detect the SoC type and version; if detection fails, abort the build.
+set(SOC_VERSION "")
+function(detect_ascend_soc_type SOC_VERSION)
+    execute_process(
+        COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'"
+        OUTPUT_VARIABLE npu_info
+        RESULT_VARIABLE npu_result
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if("${npu_info}" STREQUAL "" OR ${npu_result})
+        message(FATAL_ERROR "Auto-detection of the Ascend SoC type failed; please specify it manually or check that the Ascend device is working normally.")
+    endif()
+    set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE)
+endfunction()
+
+if(NOT SOC_TYPE)
+    detect_ascend_soc_type(SOC_VERSION)
+    set(SOC_TYPE "${SOC_VERSION}")
+    message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
+else()
+    string(TOLOWER ${SOC_TYPE} SOC_VERSION)
+endif()
+
+# Construct the SoC-specific compile option ASCEND_<SoC major SN>, e.g. ASCEND_910B, ASCEND_310P.
+string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
+set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
+
 if (CANN_INSTALL_DIR)
     # Only Support Linux.
     if (NOT UNIX)
@@ -39,6 +66,8 @@ if (CANN_INSTALL_DIR)
     target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
     target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
 
+    target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
+
     message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
     message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
 else()
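
The macro name is derived purely textually from the detected version string. A minimal host-side sketch of the same extraction (plain C++, not part of the patch; the "Ascend310P3" input is an assumed npu-smi detection result):

    #include <cstdio>
    #include <regex>
    #include <string>

    int main() {
        // Assumed detection result: "Ascend" + the chip name column from npu-smi.
        std::string soc_version = "Ascend310P3";
        // Same pattern as string(REGEX MATCH "[0-9]+[a-zA-Z]" ...) above:
        // the major serial number is the leading digits plus one letter.
        std::smatch m;
        if (std::regex_search(soc_version, m, std::regex("[0-9]+[a-zA-Z]"))) {
            std::printf("compile option: -DASCEND_%s\n", m.str().c_str());  // -DASCEND_310P
        }
        return 0;
    }
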
ggml/src/ggml-cann/aclnn_ops.cpp
CHANGED
@@ -2312,6 +2312,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     switch (src0->type) {
         case GGML_TYPE_F32:
+        {
+#ifdef ASCEND_310P
+            // Special handling for the get_row_f32 kernel on 310P: clear the destination buffer when the row is not aligned to 32 bytes.
+            if ((src0->ne[0] % 8) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
             aclrtlaunch_ascendc_get_row_f32(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
                 ((ggml_tensor*)src0->extra)->ne,
@@ -2320,7 +2328,16 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                 ((ggml_tensor*)dst->extra)->nb);
             break;
+        }
         case GGML_TYPE_F16:
+        {
+#ifdef ASCEND_310P
+            // Special handling for the get_row_f16 kernel on 310P: clear the destination buffer when the row is not aligned to 32 bytes.
+            if ((src0->ne[0] % 16) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // the output is f32 even when the input is f16
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
             aclrtlaunch_ascendc_get_row_f16(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
                 ((ggml_tensor*)src0->extra)->ne,
@@ -2329,6 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                 ((ggml_tensor*)dst->extra)->nb);
             break;
+        }
         case GGML_TYPE_Q4_0:
             aclrtlaunch_ascendc_get_row_q4_0(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
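
The pre-zeroing pairs with the kernel changes below: on 310P an unaligned tail is flushed as a full 32-byte block under SetAtomicAdd, i.e. the kernel adds into whatever dst already holds, so dst must start at zero. A small sketch of the predicate and size computation with hypothetical shapes (not the author's code):

    #include <cstdint>
    #include <cstdio>

    int main() {
        int64_t src0_ne0 = 21;         // hypothetical row length in floats
        int64_t n_rows   = 4 * 1 * 1;  // hypothetical src1->ne[0] * ne[1] * ne[2]
        // 8 floats == 32 bytes: only rows that are not whole blocks need clearing.
        if ((src0_ne0 % 8) != 0) {
            size_t dst_len = (size_t)(n_rows * src0_ne0) * sizeof(float);
            std::printf("memset %zu bytes of dst before launching the kernel\n", dst_len);
        }
        return 0;
    }
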
ggml/src/ggml-cann/kernels/CMakeLists.txt
CHANGED
@@ -1,7 +1,3 @@
-if (NOT SOC_TYPE)
-    set (SOC_TYPE "Ascend910B3")
-endif()
-
 file(GLOB SRC_FILES
     get_row_f32.cpp
     get_row_f16.cpp
@@ -13,7 +9,6 @@ file(GLOB SRC_FILES
     dup.cpp
 )
 
-string(TOLOWER ${SOC_TYPE} SOC_VERSION)
 set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
 set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
 
@@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
 )
 
+message(STATUS "CANN: compile ascend kernels with SOC_VERSION:${SOC_VERSION}.")
+ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
 # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
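
With the hard-coded Ascend910B3 fallback removed, the kernel build no longer guesses a SoC: SOC_VERSION and SOC_TYPE_COMPILE_OPTION now flow in from the parent ggml-cann CMakeLists.txt, either auto-detected via npu-smi or supplied explicitly at configure time (for example cmake -DSOC_TYPE=Ascend310P3 ...; the exact value format here is an assumption).
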
ggml/src/ggml-cann/kernels/dup.cpp
CHANGED
@@ -5,6 +5,7 @@
 using namespace AscendC;
 
 #define BUFFER_NUM 2
+const int64_t SUPPORTED_MAX_DIM = 65535;  // currently the max block dim supported by the dup kernel is 65535
 
 template <typename SRC_T, typename DST_T>
 class DupByRows {
@@ -19,6 +20,7 @@ class DupByRows {
         // Input has four dims.
         int64_t op_block_num = GetBlockNum();
         int64_t op_block_idx = GetBlockIdx();
+        assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);
 
         // param
         num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
@@ -51,24 +53,36 @@ class DupByRows {
 
     __aicore__ inline void copy_in() {
         LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
-
-        DataCopyExtParams dataCopyParams;
-        dataCopyParams.blockCount = 1;
-        dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
-        DataCopyPadExtParams<SRC_T> padParams;
-        DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
-
+        const size_t elem_per_block = 32 / sizeof(SRC_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
+        DataCopy(src_local, src_gm, cpy_elements_len);
         src_queue.EnQue(src_local);
     }
 
     __aicore__ inline void copy_out() {
         LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
-
+#ifdef ASCEND_310P
+        const size_t elem_per_block = 32 / sizeof(DST_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t len = num_elem & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(dst_gm, dst_local, len);
+        }
+        if(tail != 0) {
+            for (size_t i = tail; i < elem_per_block; i++) {
+                dst_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(dst_gm[len], dst_local[len], elem_per_block);
+            SetAtomicNone();
+        }
+#else
         DataCopyExtParams dataCopyParams;
         dataCopyParams.blockCount = 1;
         dataCopyParams.blockLen = num_elem * sizeof(DST_T);
         DataCopyPad(dst_gm, dst_local, dataCopyParams);
-
+#endif
        dst_queue.FreeTensor(dst_local);
     }
 
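
The 310P branch above compensates for not using DataCopyPad on this path: the aligned body is copied in whole 32-byte blocks, and the final partial block is zero-padded locally and then written with an atomic add, so the full-block store cannot clobber bytes past the row (which the host pre-zeroed). A host-side model of the arithmetic, plain C++ rather than AscendC, with hypothetical sizes:

    #include <cstdio>
    #include <cstring>

    int main() {
        const size_t elem_per_block = 32 / sizeof(float);  // 8 floats per 32-byte block
        size_t num_elem = 11;                              // hypothetical row length
        float dst[16] = {0};                               // global dst, pre-zeroed
        float local[16];
        for (size_t i = 0; i < num_elem; i++) local[i] = (float)(i + 1);

        size_t tail = num_elem % elem_per_block;           // 3 trailing elements
        size_t len  = num_elem & ~(elem_per_block - 1);    // 8 aligned elements
        std::memcpy(dst, local, len * sizeof(float));      // body: plain block copy
        if (tail != 0) {
            for (size_t i = tail; i < elem_per_block; i++) local[len + i] = 0.0f; // zero the pad lanes
            for (size_t i = 0; i < elem_per_block; i++)                            // models SetAtomicAdd +
                dst[len + i] += local[len + i];                                    // full-block DataCopy
        }
        for (size_t i = 0; i < 16; i++) std::printf("%g ", dst[i]);
        std::printf("\n");
        return 0;
    }

Running it prints 1 through 11 followed by zeros: the padded lanes add nothing to memory that was zeroed up front.
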
ggml/src/ggml-cann/kernels/get_row_f16.cpp
CHANGED
@@ -14,7 +14,7 @@ class GET_ROW_F16 {
                                 int64_t *output_ne_ub, size_t *output_nb_ub) {
         // TODO: use a template for F16/F32
         int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();
 
         for (int i = 0; i < 4; i++) {
             input_ne[i] = input_ne_ub[i];
@@ -59,32 +59,42 @@ class GET_ROW_F16 {
     }
 
     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+        size_t origin_len = len;
         LocalTensor<half> input_local = input_queue.AllocTensor<half>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(half);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
         if(tail != 0) {
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(half);
-            DataCopyPadExtParams<half> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len],
-                        dataCopyParams, padParams);
+            len += elem_per_block;
         }
+        DataCopy(input_local, input_gm[offset], len);
         input_queue.EnQue(input_local);
     }
 
     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
         if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(float);
            DataCopyPad(output_gm[offset + len], output_local[len],
                        dataCopyParams);
+#endif
         }
         output_queue.FreeTensor(output_local);
     }
@@ -150,6 +160,7 @@ class GET_ROW_F16 {
     GlobalTensor<float> output_gm;
     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };
 
 template <typename T>
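
copy_in() now rounds an unaligned read length up to whole 32-byte blocks so that a plain DataCopy suffices, at the cost of over-reading at most one block; the mask trick requires elem_per_block to be a power of two, which holds for 2-byte halves (16 per block) and 4-byte floats (8 per block). A standalone sketch of that arithmetic (not the author's code; sizeof(short) stands in for half on the host):

    #include <cstdio>

    // Round len elements up to a whole number of 32-byte blocks, mirroring the
    // tail/mask arithmetic in copy_in(); elem_per_block must be a power of two.
    static size_t round_up_to_block(size_t len, size_t elem_per_block) {
        size_t tail = len % elem_per_block;
        len = len & ~(elem_per_block - 1);
        if (tail != 0) {
            len += elem_per_block;
        }
        return len;
    }

    int main() {
        std::printf("%zu\n", round_up_to_block(21, 32 / sizeof(short)));  // half: 16 per block -> 32
        std::printf("%zu\n", round_up_to_block(21, 32 / sizeof(float)));  // float: 8 per block -> 24
        return 0;
    }
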
ggml/src/ggml-cann/kernels/get_row_f32.cpp
CHANGED
@@ -13,7 +13,7 @@ class GET_ROW_F32 {
                                 int64_t *indices_ne_ub, size_t *indices_nb_ub,
                                 int64_t *output_ne_ub, size_t *output_nb_ub) {
         int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();
 
         for (int i = 0; i < 4; i++) {
             input_ne[i] = input_ne_ub[i];
@@ -55,31 +55,40 @@ class GET_ROW_F32 {
 
     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
         LocalTensor<float> input_local = input_queue.AllocTensor<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
         if(tail != 0) {
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(float);
-            DataCopyPadExtParams<float> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len],
-                        dataCopyParams, padParams);
+            len += elem_per_block;
         }
+        DataCopy(input_local, input_gm[offset], len);
         input_queue.EnQue(input_local);
     }
 
     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
         if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(float);
             DataCopyPad(output_gm[offset + len], output_local[len],
                         dataCopyParams);
+#endif
         }
         output_queue.FreeTensor(output_local);
     }
@@ -144,6 +153,7 @@ class GET_ROW_F32 {
     GlobalTensor<float> output_gm;
     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };
 
 template <typename T>
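
The f32 kernel receives the same treatment as the f16 one above; only the element width changes, so a 32-byte block holds 8 floats instead of 16 halves, and on 310P the unaligned tail is likewise flushed as a zero-padded full block under an atomic add.
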
ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
CHANGED
@@ -110,9 +110,12 @@ class GET_ROW_Q4_0 {
         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
 
         // TODO: cast more data to speed up.
+#ifdef ASCEND_310P
+        // TODO: support quantization on 310P
+#else
         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-
+#endif
         // Only mul need compile by group.
         half scale = scale_gm.GetValue(scale_offset);
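
On 310P the two Cast calls that dequantize a Q4_0 group are compiled out and only a TODO remains, consistent with the commit title: this change accelerates F32 and F16 models, while quantized get_row support on 310P is left for later work.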