Mimi89757 committed on
Commit
94baae9
·
unverified ·
1 Parent(s): 1a699ea

cann : add Ascend NPU support (#2336)

Browse files

* enable Ascend NPU in src/whisper.cpp
* sync test-backend-ops with llama.cpp

ggml/src/ggml-cann/Doxyfile CHANGED
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
32
  # title of most generated pages and in a few other places.
33
  # The default value is: My Project.
34
 
35
- PROJECT_NAME = "llama.cpp"
36
 
37
  # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
38
  # could be handy for archiving the generated documentation or if some version
@@ -44,7 +44,7 @@ PROJECT_NUMBER =
44
  # for a project that appears at the top of each page and should give viewer a
45
  # quick idea about the purpose of the project. Keep the description short.
46
 
47
- PROJECT_BRIEF = "llama inference engine"
48
 
49
  # With the PROJECT_LOGO tag one can specify a logo or an icon that is included
50
  # in the documentation. The maximum height of the logo should not exceed 55
 
32
  # title of most generated pages and in a few other places.
33
  # The default value is: My Project.
34
 
35
+ PROJECT_NAME = "whisper.cpp"
36
 
37
  # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
38
  # could be handy for archiving the generated documentation or if some version
 
44
  # for a project that appears at the top of each page and should give viewer a
45
  # quick idea about the purpose of the project. Keep the description short.
46
 
47
+ PROJECT_BRIEF = "Port of OpenAI's Whisper model in C/C++"
48
 
49
  # With the PROJECT_LOGO tag one can specify a logo or an icon that is included
50
  # in the documentation. The maximum height of the logo should not exceed 55
src/whisper.cpp CHANGED
@@ -29,6 +29,10 @@
29
  #include "openvino/whisper-openvino-encoder.h"
30
  #endif
31
 
 
 
 
 
32
  #include "ggml.h"
33
  #include "ggml-alloc.h"
34
  #include "ggml-backend.h"
@@ -1283,6 +1287,16 @@ static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & pa
1283
  }
1284
  #endif
1285
 
 
 
 
 
 
 
 
 
 
 
1286
  return result;
1287
  }
1288
 
@@ -1335,6 +1349,10 @@ static ggml_backend_buffer_type_t whisper_default_buffer_type(const whisper_cont
1335
  result || (result = ggml_backend_vk_buffer_type(params.gpu_device));
1336
  #endif
1337
 
 
 
 
 
1338
  result || (result = ggml_backend_cpu_buffer_type());
1339
 
1340
  return result;
@@ -4337,8 +4355,8 @@ const char * whisper_print_system_info(void) {
4337
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
4338
  s += "CUDA = " + std::to_string(ggml_cpu_has_cuda()) + " | ";
4339
  s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
4340
- s += "OPENVINO = " + std::to_string(whisper_has_openvino()) ;
4341
-
4342
  return s.c_str();
4343
  }
4344
 
 
29
  #include "openvino/whisper-openvino-encoder.h"
30
  #endif
31
 
32
+ #ifdef GGML_USE_CANN
33
+ #include "ggml-cann.h"
34
+ #endif
35
+
36
  #include "ggml.h"
37
  #include "ggml-alloc.h"
38
  #include "ggml-backend.h"
 
1287
  }
1288
  #endif
1289
 
1290
+ #ifdef GGML_USE_CANN
1291
+ if (params.use_gpu) {
1292
+ WHISPER_LOG_INFO("%s: using CANN backend\n", __func__);
1293
+ result = ggml_backend_cann_init(params.gpu_device);
1294
+ if (!result) {
1295
+ WHISPER_LOG_ERROR("%s: ggml_backend_cann_init() failed\n", __func__);
1296
+ }
1297
+ }
1298
+ #endif
1299
+
1300
  return result;
1301
  }
1302
 
 
1349
  result || (result = ggml_backend_vk_buffer_type(params.gpu_device));
1350
  #endif
1351
 
1352
+ #ifdef GGML_USE_CANN
1353
+ result || (result = ggml_backend_cann_buffer_type(params.gpu_device));
1354
+ #endif
1355
+
1356
  result || (result = ggml_backend_cpu_buffer_type());
1357
 
1358
  return result;
 
4355
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
4356
  s += "CUDA = " + std::to_string(ggml_cpu_has_cuda()) + " | ";
4357
  s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
4358
+ s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
4359
+ s += "CANN = " + std::to_string(ggml_cpu_has_cann()) ;
4360
  return s.c_str();
4361
  }
4362
 
tests/test-backend-ops.cpp CHANGED
@@ -1,7 +1,6 @@
1
  #include <ggml.h>
2
  #include <ggml-alloc.h>
3
  #include <ggml-backend.h>
4
- #include <ggml-backend-impl.h>
5
 
6
  #include <algorithm>
7
  #include <array>
@@ -80,14 +79,22 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
80
  im = nullptr;
81
  }
82
  }
 
83
  ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
84
  GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
 
 
 
 
 
 
 
85
  ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
86
  } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
87
  // This is going to create some weird integers though.
88
  ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
89
  } else {
90
- GGML_ASSERT(false);
91
  }
92
  }
93
 
@@ -125,7 +132,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
125
  tt.to_float(&buf[i], vq.data(), bs);
126
  tv.insert(tv.end(), vq.begin(), vq.end());
127
  } else {
128
- GGML_ASSERT(false);
129
  }
130
  }
131
  }
@@ -760,7 +767,7 @@ struct test_dup : public test_case {
760
  }
761
 
762
  test_dup(ggml_type type = GGML_TYPE_F32,
763
- std::array<int64_t, 4> ne = {10, 10, 10, 1},
764
  std::array<int64_t, 4> permute = {0, 0, 0, 0})
765
  : type(type), ne(ne), permute(permute),
766
  _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
@@ -780,9 +787,11 @@ struct test_cpy : public test_case {
780
  const ggml_type type_src;
781
  const ggml_type type_dst;
782
  const std::array<int64_t, 4> ne;
 
 
783
 
784
  std::string vars() override {
785
- return VARS_TO_STR3(type_src, type_dst, ne);
786
  }
787
 
788
  double max_nmse_err() override {
@@ -794,12 +803,17 @@ struct test_cpy : public test_case {
794
  }
795
 
796
  test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
797
- std::array<int64_t, 4> ne = {10, 10, 10, 1})
798
- : type_src(type_src), type_dst(type_dst), ne(ne) {}
 
 
799
 
800
  ggml_tensor * build_graph(ggml_context * ctx) override {
801
  ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
802
- ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data());
 
 
 
803
  ggml_tensor * out = ggml_cpy(ctx, src, dst);
804
  return out;
805
  }
@@ -1175,6 +1189,7 @@ struct test_soft_max : public test_case {
1175
  }
1176
  };
1177
 
 
1178
  // GGML_OP_ROPE
1179
  struct test_rope : public test_case {
1180
  const ggml_type type;
@@ -1267,6 +1282,32 @@ struct test_pool2d : public test_case {
1267
  }
1268
  };
1269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1270
  // GGML_OP_IM2COL
1271
  struct test_im2col : public test_case {
1272
  const ggml_type type_input;
@@ -1280,7 +1321,7 @@ struct test_im2col : public test_case {
1280
  // padding
1281
  const int p0;
1282
  const int p1;
1283
- // dilatation
1284
  const int d0;
1285
  const int d1;
1286
  // mode
@@ -1393,7 +1434,7 @@ struct test_argsort : public test_case {
1393
  ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
1394
  }
1395
  } else {
1396
- GGML_ASSERT(false);
1397
  }
1398
  }
1399
  }
@@ -1470,6 +1511,7 @@ struct test_group_norm : public test_case {
1470
  const ggml_type type;
1471
  const std::array<int64_t, 4> ne;
1472
  const int32_t num_groups;
 
1473
 
1474
  std::string vars() override {
1475
  return VARS_TO_STR3(type, ne, num_groups);
@@ -1477,12 +1519,13 @@ struct test_group_norm : public test_case {
1477
 
1478
  test_group_norm(ggml_type type = GGML_TYPE_F32,
1479
  std::array<int64_t, 4> ne = {64, 64, 320, 1},
1480
- int32_t num_groups = 32)
1481
- : type(type), ne(ne), num_groups(num_groups) {}
 
1482
 
1483
  ggml_tensor * build_graph(ggml_context * ctx) override {
1484
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1485
- ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
1486
  return out;
1487
  }
1488
  };
@@ -2053,6 +2096,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2053
  GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
2054
  GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
2055
  GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
 
2056
  };
2057
 
2058
  // unary ops
@@ -2097,6 +2141,19 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2097
 
2098
  test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
2099
  test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
 
 
 
 
 
 
 
 
 
 
 
 
 
2100
 
2101
  test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
2102
  test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
@@ -2110,12 +2167,22 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2110
  test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
2111
  test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
2112
  test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
 
 
 
 
2113
  test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
2114
  test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
2115
 
2116
  for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2117
  for (ggml_type type_dst : all_types) {
2118
  test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
 
 
 
 
 
 
2119
  }
2120
  }
2121
 
@@ -2165,6 +2232,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2165
  test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
2166
  }
2167
 
 
2168
  for (ggml_type type_a : base_types) {
2169
  for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
2170
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
@@ -2184,10 +2252,31 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2184
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
2185
  }
2186
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2187
 
2188
  for (ggml_type type_a : other_types) {
2189
  for (ggml_type type_b : {GGML_TYPE_F32}) {
2190
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
 
 
 
2191
  }
2192
  }
2193
 
@@ -2247,7 +2336,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2247
  for (int n = 0; n < 10; ++n) {
2248
  int64_t ne0 = dist_ne0(rng);
2249
  int64_t ne1 = dist_ne1(rng);
2250
- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
2251
  }
2252
 
2253
  exponent <<= 1;
@@ -2266,7 +2355,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2266
  }
2267
  }
2268
  }
2269
-
2270
  test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
2271
  test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
2272
  test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
@@ -2380,7 +2469,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2380
  return true;
2381
  }
2382
 
2383
- GGML_ASSERT(false);
2384
  return false;
2385
  }
2386
 
 
1
  #include <ggml.h>
2
  #include <ggml-alloc.h>
3
  #include <ggml-backend.h>
 
4
 
5
  #include <algorithm>
6
  #include <array>
 
79
  im = nullptr;
80
  }
81
  }
82
+
83
  ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
84
  GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
85
+ // TODO: other cases
86
+ //#pragma omp parallel for
87
+ //for (int i = 0; i < tensor->ne[1]; i++) {
88
+ // ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
89
+ // i * tensor->ne[0], 1, tensor->ne[0], im);
90
+ //}
91
+
92
  ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
93
  } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
94
  // This is going to create some weird integers though.
95
  ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
96
  } else {
97
+ GGML_ABORT("fatal error");
98
  }
99
  }
100
 
 
132
  tt.to_float(&buf[i], vq.data(), bs);
133
  tv.insert(tv.end(), vq.begin(), vq.end());
134
  } else {
135
+ GGML_ABORT("fatal error");
136
  }
137
  }
138
  }
 
767
  }
768
 
769
  test_dup(ggml_type type = GGML_TYPE_F32,
770
+ std::array<int64_t, 4> ne = {10, 10, 20, 1},
771
  std::array<int64_t, 4> permute = {0, 0, 0, 0})
772
  : type(type), ne(ne), permute(permute),
773
  _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
 
787
  const ggml_type type_src;
788
  const ggml_type type_dst;
789
  const std::array<int64_t, 4> ne;
790
+ const std::array<int64_t, 4> permute;
791
+ bool _src_use_permute;
792
 
793
  std::string vars() override {
794
+ return VARS_TO_STR4(type_src, type_dst, ne, permute);
795
  }
796
 
797
  double max_nmse_err() override {
 
803
  }
804
 
805
  test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
806
+ std::array<int64_t, 4> ne = {10, 10, 10, 1},
807
+ std::array<int64_t, 4> permute = {0, 0, 0, 0})
808
+ : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
809
+ _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
810
 
811
  ggml_tensor * build_graph(ggml_context * ctx) override {
812
  ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
813
+ if (_src_use_permute) {
814
+ src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
815
+ }
816
+ ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
817
  ggml_tensor * out = ggml_cpy(ctx, src, dst);
818
  return out;
819
  }
 
1189
  }
1190
  };
1191
 
1192
+
1193
  // GGML_OP_ROPE
1194
  struct test_rope : public test_case {
1195
  const ggml_type type;
 
1282
  }
1283
  };
1284
 
1285
+ // GGML_OP_CONV_TRANSPOSE_1D
1286
+ struct test_conv_transpose_1d : public test_case {
1287
+ const std::array<int64_t, 4> ne_input;
1288
+ const std::array<int64_t, 4> ne_kernel;
1289
+
1290
+ const int s0; // stride
1291
+ const int p0; // padding
1292
+ const int d0; // dilation
1293
+
1294
+ std::string vars() override {
1295
+ return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
1296
+ }
1297
+
1298
+ test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
1299
+ std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
1300
+ int s0 = 1, int p0 = 0, int d0 = 1)
1301
+ : ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
1302
+
1303
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1304
+ ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
1305
+ ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
1306
+ ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
1307
+ return out;
1308
+ }
1309
+ };
1310
+
1311
  // GGML_OP_IM2COL
1312
  struct test_im2col : public test_case {
1313
  const ggml_type type_input;
 
1321
  // padding
1322
  const int p0;
1323
  const int p1;
1324
+ // dilation
1325
  const int d0;
1326
  const int d1;
1327
  // mode
 
1434
  ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
1435
  }
1436
  } else {
1437
+ GGML_ABORT("fatal error");
1438
  }
1439
  }
1440
  }
 
1511
  const ggml_type type;
1512
  const std::array<int64_t, 4> ne;
1513
  const int32_t num_groups;
1514
+ const float eps;
1515
 
1516
  std::string vars() override {
1517
  return VARS_TO_STR3(type, ne, num_groups);
 
1519
 
1520
  test_group_norm(ggml_type type = GGML_TYPE_F32,
1521
  std::array<int64_t, 4> ne = {64, 64, 320, 1},
1522
+ int32_t num_groups = 32,
1523
+ float eps = 1e-6f)
1524
+ : type(type), ne(ne), num_groups(num_groups), eps(eps) {}
1525
 
1526
  ggml_tensor * build_graph(ggml_context * ctx) override {
1527
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1528
+ ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps);
1529
  return out;
1530
  }
1531
  };
 
2096
  GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
2097
  GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
2098
  GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
2099
+ GGML_TYPE_BF16,
2100
  };
2101
 
2102
  // unary ops
 
2141
 
2142
  test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
2143
  test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
2144
+ // test cases for 1D im2col
2145
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
2146
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
2147
+
2148
+ test_cases.emplace_back(new test_conv_transpose_1d());
2149
+ test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
2150
+ test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
2151
+ test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
2152
+ test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
2153
+ test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
2154
+ test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
2155
+ test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
2156
+
2157
 
2158
  test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
2159
  test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
 
2167
  test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
2168
  test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
2169
  test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
2170
+ test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3}));
2171
+ test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
2172
+ test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
2173
+ test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
2174
  test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
2175
  test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
2176
 
2177
  for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2178
  for (ggml_type type_dst : all_types) {
2179
  test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
2180
+ test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
2181
+ }
2182
+ }
2183
+ for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2184
+ for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2185
+ test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
2186
  }
2187
  }
2188
 
 
2232
  test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
2233
  }
2234
 
2235
+ #if 1
2236
  for (ggml_type type_a : base_types) {
2237
  for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
2238
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
 
2252
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
2253
  }
2254
  }
2255
+ #else
2256
+ // m = a rows
2257
+ // n = b rows
2258
+ // k = cols
2259
+ std::uniform_int_distribution<> dist_m(1, 128);
2260
+ std::uniform_int_distribution<> dist_n(16, 128);
2261
+ std::uniform_int_distribution<> dist_k(1, 16);
2262
+ for (int i = 0; i < 1000; i++) {
2263
+ for (ggml_type type_a : all_types) {
2264
+ for (ggml_type type_b : {GGML_TYPE_F32}) {
2265
+ int m = dist_m(rng);
2266
+ int n = dist_n(rng);
2267
+ int k = dist_k(rng) * ggml_blck_size(type_a);
2268
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
2269
+ }
2270
+ }
2271
+ }
2272
+ #endif
2273
 
2274
  for (ggml_type type_a : other_types) {
2275
  for (ggml_type type_b : {GGML_TYPE_F32}) {
2276
+ if (ggml_blck_size(type_a) != 256) {
2277
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1}));
2278
+ }
2279
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1}));
2280
  }
2281
  }
2282
 
 
2336
  for (int n = 0; n < 10; ++n) {
2337
  int64_t ne0 = dist_ne0(rng);
2338
  int64_t ne1 = dist_ne1(rng);
2339
+ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
2340
  }
2341
 
2342
  exponent <<= 1;
 
2355
  }
2356
  }
2357
  }
2358
+ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
2359
  test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
2360
  test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
2361
  test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
 
2469
  return true;
2470
  }
2471
 
2472
+ GGML_ABORT("fatal error");
2473
  return false;
2474
  }
2475