Acly committed
Commit 0c950d5 · 1 Parent(s): acb674d

ggml : Depthwise 2D convolution (ggml/1152)


* ggml-cpu : kernels for faster depthwise 2D convolution

* fix compile: remove static after moving to ops.cpp

* add dilation for depthwise_conv_2d

* review: rename to ggml_conv_2d_dw_direct, remove redundant struct keywords, pass by ref, whitespace

* review: rename depthwise_conv_2d -> conv_2d_dw everywhere

ggml/include/ggml.h CHANGED
@@ -481,6 +481,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
@@ -677,6 +678,9 @@ extern "C" {
     GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
     GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
 
+    // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
+    GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -1660,7 +1664,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
-    // depthwise
+    // depthwise (via im2col and mul_mat)
     GGML_API struct ggml_tensor * ggml_conv_2d_dw(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,  // convolution kernel
@@ -1672,6 +1676,22 @@ extern "C" {
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1
 
+    // Depthwise 2D convolution
+    // may be faster than ggml_conv_2d_dw, but not available in all backends
+    // a:   KW    KH    1    C    convolution kernel
+    // b:   W     H     C    N    input data
+    // res: W_out H_out C    N
+    GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   stride0,
+            int                   stride1,
+            int                   pad0,
+            int                   pad1,
+            int                   dilation0,
+            int                   dilation1);
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
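A minimal usage sketch of the new entry point (not part of the commit; the tensor sizes and the ctx variable are illustrative, only ggml_conv_2d_dw_direct and its parameter order come from the header above):

    // hypothetical: 3x3 depthwise convolution over a 64-channel 128x128 input, batch 1
    // kernel a: KW=3 KH=3 1 C=64, input b: W=128 H=128 C=64 N=1
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 1, 64);
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 128, 128, 64, 1);
    // stride 1, padding 1, dilation 1 -> output keeps the spatial size: 128 x 128 x 64 x 1
    struct ggml_tensor * res = ggml_conv_2d_dw_direct(ctx, a, b, 1, 1, 1, 1, 1, 1);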
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -1932,6 +1932,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_CONV_2D_DW:
+            {
+                ggml_compute_forward_conv_2d_dw(params, tensor);
+            } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
                 ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -2268,6 +2272,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
ggml/src/ggml-cpu/ops.cpp CHANGED
@@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d(
         }
     }
 }
 
+// ggml_compute_forward_conv_2d_dw
+
+struct ggml_conv_2d_dw_params {
+    int64_t channels;
+    int64_t batch;
+    int64_t src_w;
+    int64_t src_h;
+    int64_t dst_w;
+    int64_t dst_h;
+    int64_t knl_w;
+    int64_t knl_h;
+    int stride_x;
+    int stride_y;
+    int pad_x;
+    int pad_y;
+    int dilation_x;
+    int dilation_y;
+};
+
+static void ggml_compute_forward_conv_2d_dw_cwhn(
+        const ggml_compute_params * params,
+        const ggml_tensor * src,
+        const ggml_tensor * kernel,
+        ggml_tensor * dst,
+        const ggml_conv_2d_dw_params & p) {
+
+    const int64_t c = p.channels;
+    const float * knl_data = (const float *)kernel->data;
+
+    const int64_t rows_total = p.dst_h * p.batch;
+    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
+    const int64_t row_start = params->ith * rows_per_thread;
+    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
+
+#ifdef GGML_SIMD
+    const int64_t pkg_size = GGML_F32_EPR;
+    const int64_t pkg_count = c / pkg_size;
+    const int64_t c_pkg_end = pkg_count * pkg_size;
+#else
+    const int64_t c_pkg_end = 0;
+#endif
+
+    for (int64_t row = row_start; row < row_end; ++row) {
+        const int64_t dst_y = row % p.dst_h;
+        const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
+        for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+            float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
+            const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
+            const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
+
+#ifdef GGML_SIMD
+            // Vectorized loop
+            for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
+                GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = src_y_base + knl_y * p.dilation_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = src_x_base + knl_x * p.dilation_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
+                        GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
+                        sum = GGML_F32_VEC_FMA(sum, k, s);
+                    }
+                }
+                GGML_F32_VEC_STORE(dst_data + c_i, sum);
+            }
+#endif
+            // Scalar loop
+            for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
+                float sum = 0.0f;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = src_y_base + knl_y * p.dilation_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = src_x_base + knl_x * p.dilation_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
+                             * src_data[(src_y * p.src_w + src_x) * c + c_i];
+                    }
+                }
+                dst_data[c_i] = sum;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_2d_dw_whcn(
+        const ggml_compute_params * params,
+        const ggml_tensor * src,
+        const ggml_tensor * kernel,
+        ggml_tensor * dst,
+        const ggml_conv_2d_dw_params & p) {
+
+    const int64_t n = p.channels * p.batch;
+    const int64_t per_thread = (n + params->nth - 1) / params->nth;
+    const int64_t start = params->ith * per_thread;
+    const int64_t end = MIN(start + per_thread, n);
+
+    for (int64_t i = start; i < end; ++i) {
+        const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
+        const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
+        float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
+
+        for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
+            for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+
+                float sum = 0.0f;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        sum += knl_data[knl_y * p.knl_w + knl_x]
+                             * src_data[src_y * p.src_w + src_x];
+                    }
+                }
+                dst_data[dst_y * p.dst_w + dst_x] = sum;
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_conv_2d_dw(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * kernel = dst->src[0];
+    const ggml_tensor * src = dst->src[1];
+    ggml_conv_2d_dw_params p;
+    p.channels = src->ne[2];
+    p.batch = src->ne[3];
+    p.src_w = src->ne[0];
+    p.src_h = src->ne[1];
+    p.dst_w = dst->ne[0];
+    p.dst_h = dst->ne[1];
+    p.knl_w = kernel->ne[0];
+    p.knl_h = kernel->ne[1];
+    p.stride_x = dst->op_params[0];
+    p.stride_y = dst->op_params[1];
+    p.pad_x = dst->op_params[2];
+    p.pad_y = dst->op_params[3];
+    p.dilation_x = dst->op_params[4];
+    p.dilation_y = dst->op_params[5];
+
+    GGML_ASSERT(kernel->ne[3] == p.channels);
+    GGML_ASSERT(dst->ne[3] == p.batch);
+
+    if (ggml_is_contiguous(src)) {
+        ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
+    } else if (ggml_is_contiguous_channels(src)) {
+        // kernel should also have channels most contiguous in memory
+        GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
+        ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
+    } else {
+        GGML_ABORT("non-contiguous memory layout not supported");
+    }
+}
+
 // ggml_compute_forward_pool_1d_sk_p0
 
 static void ggml_compute_forward_pool_1d_sk_p0(
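A side note on the two code paths above (a reading aid, not part of the diff): the whcn variant walks one (channel, batch) plane at a time, while the cwhn variant keeps all channels of a pixel adjacent so it can vectorize across channels. The addressing each kernel uses follows directly from its layout:

    // WHCN (ggml's default contiguous layout): for a fixed channel/batch plane,
    // pixel (x, y) sits at src_data[y * src_w + x]; one plane per (channel, batch) pair.
    // CWHN (channels most contiguous, see ggml_is_contiguous_channels): all channels of a
    // pixel are adjacent, so channel c of pixel (x, y) sits at
    // src_data[(y * src_w + x) * channels + c], which is what GGML_F32_VEC_LOAD reads
    // a full register of channels from at once.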
ggml/src/ggml-cpu/ops.h CHANGED
@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
ggml/src/ggml.c CHANGED
@@ -956,6 +956,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CONV_TRANSPOSE_1D",
     "IM2COL",
     "IM2COL_BACK",
+    "CONV_2D_DW",
     "CONV_TRANSPOSE_2D",
     "POOL_1D",
     "POOL_2D",
@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "conv_transpose_1d(x)",
     "im2col(x)",
     "im2col_back(x)",
+    "conv_2d_dw(x)",
     "conv_transpose_2d(x)",
     "pool_1d(x)",
     "pool_2d(x)",
@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -1344,6 +1346,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
 }
 
+bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
+    return
+        tensor->nb[0] > tensor->nb[2] &&
+        tensor->nb[1] > tensor->nb[0] &&
+        tensor->nb[2] == ggml_type_size(tensor->type);
+}
+
 static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
@@ -4050,6 +4059,46 @@ struct ggml_tensor * ggml_conv_2d_dw(
     return result;
 }
 
+// ggml_conv_2d_dw_direct
+
+struct ggml_tensor * ggml_conv_2d_dw_direct(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   stride0,
+        int                   stride1,
+        int                   pad0,
+        int                   pad1,
+        int                   dilation0,
+        int                   dilation1) {
+    GGML_ASSERT(a->ne[2] == 1);
+    GGML_ASSERT(a->ne[3] == b->ne[2]);
+    int64_t ne[4];
+    ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0);
+    ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1);
+    ne[2] = b->ne[2];
+    ne[3] = b->ne[3];
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
+
+    if (ggml_is_contiguous_channels(b)) {
+        // Result will be permuted the same way as input (CWHN order)
+        const int64_t type_size = ggml_type_size(result->type);
+        GGML_ASSERT(ggml_blck_size(result->type) == 1);
+        result->nb[0] = result->ne[2] * type_size;
+        result->nb[1] = result->ne[0] * result->nb[0];
+        result->nb[2] = type_size;
+    }
+
+    int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op = GGML_OP_CONV_2D_DW;
+    result->src[0] = a;
+    result->src[1] = b;
+    return result;
+}
+
 // ggml_conv_transpose_2d_p0
 
 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
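For context on the output shape set in ggml_conv_2d_dw_direct above: ggml_calc_conv_output_size is an existing helper shared with the other ggml convolution ops and is not changed by this commit; assuming it implements the standard convolution output arithmetic, each spatial output size works out as:

    // assumed formula of the existing helper (standard convolution output size):
    // ins = input size, ks = kernel size, s = stride, p = padding, d = dilation
    int64_t out = (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
    // e.g. ins = 128, ks = 3, s = 1, p = 1, d = 1  ->  out = 128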