ggml : Depthwise 2D convolution (ggml/1152)
* ggml-cpu : kernels for faster depthwise 2D convolution
* fix compile: remove static after moving to ops.cpp
* add dilation for depthwise_conv_2d
* review: rename to ggml_conv_2d_dw_direct, remove redundant struct keywords, pass by ref, whitespace
* review: rename depthwise_conv_2d -> conv_2d_dw everywhere
- ggml/include/ggml.h +21 -1
- ggml/src/ggml-cpu/ggml-cpu.c +5 -0
- ggml/src/ggml-cpu/ops.cpp +172 -0
- ggml/src/ggml-cpu/ops.h +1 -0
- ggml/src/ggml.c +51 -2
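
For context: a depthwise 2D convolution applies one KWxKH filter per channel and never mixes channels, so it needs C*KW*KH multiply-adds per output pixel instead of the C^2*KW*KH of a full conv2d. A minimal scalar reference of the computation this commit implements (hypothetical stand-alone helper, WHCN layout, batch size 1):

```c
// Reference depthwise conv2d: one KWxKH filter per channel, WHCN layout,
// batch 1, implicit zero padding. Hypothetical helper for illustration only.
void depthwise_conv_2d_ref(const float * src, const float * knl, float * dst,
                           int W, int H, int C, int KW, int KH,
                           int stride, int pad, int dilation,
                           int W_out, int H_out) {
    for (int c = 0; c < C; ++c) {
        for (int y = 0; y < H_out; ++y) {
            for (int x = 0; x < W_out; ++x) {
                float sum = 0.0f;
                for (int ky = 0; ky < KH; ++ky) {
                    for (int kx = 0; kx < KW; ++kx) {
                        const int sy = y * stride + ky * dilation - pad;
                        const int sx = x * stride + kx * dilation - pad;
                        if (sy < 0 || sy >= H || sx < 0 || sx >= W) {
                            continue; // out-of-bounds taps contribute zero
                        }
                        sum += knl[(c * KH + ky) * KW + kx]
                             * src[(c * H + sy) * W + sx];
                    }
                }
                dst[(c * H_out + y) * W_out + x] = sum;
            }
        }
    }
}
```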
ggml/include/ggml.h
CHANGED
```diff
@@ -481,6 +481,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
@@ -677,6 +678,9 @@ extern "C" {
     GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
     GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
 
+    // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
+    GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -1660,7 +1664,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
-    // depthwise
+    // depthwise (via im2col and mul_mat)
     GGML_API struct ggml_tensor * ggml_conv_2d_dw(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,  // convolution kernel
@@ -1672,6 +1676,22 @@ extern "C" {
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1
 
+    // Depthwise 2D convolution
+    // may be faster than ggml_conv_2d_dw, but not available in all backends
+    // a:   KW    KH    1    C    convolution kernel
+    // b:   W     H     C    N    input data
+    // res: W_out H_out C    N
+    GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   stride0,
+            int                   stride1,
+            int                   pad0,
+            int                   pad1,
+            int                   dilation0,
+            int                   dilation1);
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
```
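
A minimal usage sketch of the new API, assuming the classic single-context CPU workflow (`ggml_graph_compute_with_ctx` lives in `ggml-cpu.h` after the backend split); the sizes are illustrative:

```c
#include "ggml.h"
#include "ggml-cpu.h" // assumed: CPU backend header providing ggml_graph_compute_with_ctx

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 64 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // a: KW KH 1 C kernel, b: W H C N input (the default WHCN contiguous layout)
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 1, 16);
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 64, 16, 1);
    // ... fill a->data (weights) and b->data (pixels) ...

    // stride 1, pad 1, dilation 1 -> output shape stays 64 x 64 x 16 x 1
    struct ggml_tensor * out = ggml_conv_2d_dw_direct(ctx, a, b, 1, 1, 1, 1, 1, 1);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 4);

    ggml_free(ctx);
    return 0;
}
```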
ggml/src/ggml-cpu/ggml-cpu.c
CHANGED
```diff
@@ -1932,6 +1932,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_CONV_2D_DW:
+            {
+                ggml_compute_forward_conv_2d_dw(params, tensor);
+            } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
                 ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -2268,6 +2272,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
```
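
Besides the dispatch case, `ggml_get_n_tasks` gives the new op the full thread count, like the other convolution ops; the kernels in ops.cpp then self-partition from their `ith`/`nth` coordinates. The chunking pattern they use, as a stand-alone sketch (names are illustrative):

```c
#include <stdint.h>

// Thread ith of nth claims one contiguous chunk of n_items work items
// (output rows for the CWHN kernel, channel planes for the WHCN kernel).
// Ceiling division means trailing threads may get a short or empty range.
static void process_chunk(int ith, int nth, int64_t n_items) {
    const int64_t per_thread = (n_items + nth - 1) / nth;
    const int64_t start = ith * per_thread;
    const int64_t end   = start + per_thread < n_items ? start + per_thread : n_items;
    for (int64_t i = start; i < end; ++i) {
        // process work item i
        (void)i;
    }
}
```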
ggml/src/ggml-cpu/ops.cpp
CHANGED
```diff
@@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d(
     }
 }
 
+// ggml_compute_forward_conv_2d_dw
+
+struct ggml_conv_2d_dw_params {
+    int64_t channels;
+    int64_t batch;
+    int64_t src_w;
+    int64_t src_h;
+    int64_t dst_w;
+    int64_t dst_h;
+    int64_t knl_w;
+    int64_t knl_h;
+    int stride_x;
+    int stride_y;
+    int pad_x;
+    int pad_y;
+    int dilation_x;
+    int dilation_y;
+};
+
+static void ggml_compute_forward_conv_2d_dw_cwhn(
+        const ggml_compute_params * params,
+        const ggml_tensor * src,
+        const ggml_tensor * kernel,
+        ggml_tensor * dst,
+        const ggml_conv_2d_dw_params & p) {
+
+    const int64_t c = p.channels;
+    const float * knl_data = (const float *)kernel->data;
+
+    const int64_t rows_total = p.dst_h * p.batch;
+    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
+    const int64_t row_start = params->ith * rows_per_thread;
+    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
+
+#ifdef GGML_SIMD
+    const int64_t pkg_size = GGML_F32_EPR;
+    const int64_t pkg_count = c / pkg_size;
+    const int64_t c_pkg_end = pkg_count * pkg_size;
+#else
+    const int64_t c_pkg_end = 0;
+#endif
+
+    for (int64_t row = row_start; row < row_end; ++row) {
+        const int64_t dst_y = row % p.dst_h;
+        const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
+        for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+            float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
+            const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
+            const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
+
+#ifdef GGML_SIMD
+            // Vectorized loop
+            for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
+                GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = src_y_base + knl_y * p.dilation_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = src_x_base + knl_x * p.dilation_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
+                        GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
+                        sum = GGML_F32_VEC_FMA(sum, k, s);
+                    }
+                }
+                GGML_F32_VEC_STORE(dst_data + c_i, sum);
+            }
+#endif
+            // Scalar loop
+            for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
+                float sum = 0.0f;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = src_y_base + knl_y * p.dilation_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = src_x_base + knl_x * p.dilation_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
+                             * src_data[(src_y * p.src_w + src_x) * c + c_i];
+                    }
+                }
+                dst_data[c_i] = sum;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_2d_dw_whcn(
+        const ggml_compute_params * params,
+        const ggml_tensor * src,
+        const ggml_tensor * kernel,
+        ggml_tensor * dst,
+        const ggml_conv_2d_dw_params & p) {
+
+    const int64_t n = p.channels * p.batch;
+    const int64_t per_thread = (n + params->nth - 1) / params->nth;
+    const int64_t start = params->ith * per_thread;
+    const int64_t end = MIN(start + per_thread, n);
+
+    for (int64_t i = start; i < end; ++i) {
+        const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
+        const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
+        float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
+
+        for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
+            for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+
+                float sum = 0.0f;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        sum += knl_data[knl_y * p.knl_w + knl_x]
+                             * src_data[src_y * p.src_w + src_x];
+                    }
+                }
+                dst_data[dst_y * p.dst_w + dst_x] = sum;
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_conv_2d_dw(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * kernel = dst->src[0];
+    const ggml_tensor * src = dst->src[1];
+    ggml_conv_2d_dw_params p;
+    p.channels = src->ne[2];
+    p.batch = src->ne[3];
+    p.src_w = src->ne[0];
+    p.src_h = src->ne[1];
+    p.dst_w = dst->ne[0];
+    p.dst_h = dst->ne[1];
+    p.knl_w = kernel->ne[0];
+    p.knl_h = kernel->ne[1];
+    p.stride_x = dst->op_params[0];
+    p.stride_y = dst->op_params[1];
+    p.pad_x = dst->op_params[2];
+    p.pad_y = dst->op_params[3];
+    p.dilation_x = dst->op_params[4];
+    p.dilation_y = dst->op_params[5];
+
+    GGML_ASSERT(kernel->ne[3] == p.channels);
+    GGML_ASSERT(dst->ne[3] == p.batch);
+
+    if (ggml_is_contiguous(src)) {
+        ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
+    } else if (ggml_is_contiguous_channels(src)) {
+        // kernel should also have channels most contiguous in memory
+        GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
+        ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
+    } else {
+        GGML_ABORT("non-contiguous memory layout not supported");
+    }
+}
+
 // ggml_compute_forward_pool_1d_sk_p0
 
 static void ggml_compute_forward_pool_1d_sk_p0(
```
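
The WHCN kernel hands whole channel planes to threads, while the CWHN kernel splits output rows and keeps channels innermost, so the `c_i` loop walks consecutive floats and `GGML_F32_VEC_LOAD`/`GGML_F32_VEC_FMA` can process `GGML_F32_EPR` channels per iteration. A sketch of the flat indexing behind that (illustrative helper, not part of the patch):

```c
#include <stdint.h>

// Flat float index of element (x, y, c, n) in a CWHN-packed tensor:
// channels vary fastest, so fixing (x, y, n) yields C consecutive floats,
// which is exactly what the vectorized loop loads per kernel tap.
static inline int64_t cwhn_index(int64_t x, int64_t y, int64_t c, int64_t n,
                                 int64_t W, int64_t H, int64_t C) {
    return ((n * H + y) * W + x) * C + c;
}

// In the default WHCN layout the same element sits at
// ((n * C + c) * H + y) * W + x: neighboring channels are a whole W*H
// plane apart, so per-pixel SIMD across channels is not possible there.
```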
ggml/src/ggml-cpu/ops.h
CHANGED
```diff
@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
```
ggml/src/ggml.c
CHANGED
```diff
@@ -956,6 +956,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CONV_TRANSPOSE_1D",
     "IM2COL",
     "IM2COL_BACK",
+    "CONV_2D_DW",
     "CONV_TRANSPOSE_2D",
     "POOL_1D",
     "POOL_2D",
@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "conv_transpose_1d(x)",
     "im2col(x)",
     "im2col_back(x)",
+    "conv_2d_dw(x)",
     "conv_transpose_2d(x)",
     "pool_1d(x)",
     "pool_2d(x)",
@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -1344,6 +1346,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
 }
 
+bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
+    return
+        tensor->nb[0] > tensor->nb[2] &&
+        tensor->nb[1] > tensor->nb[0] &&
+        tensor->nb[2] == ggml_type_size(tensor->type);
+}
+
 static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
@@ -4050,6 +4059,46 @@ struct ggml_tensor * ggml_conv_2d_dw(
     return result;
 }
 
+// ggml_conv_2d_dw_direct
+
+struct ggml_tensor * ggml_conv_2d_dw_direct(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   stride0,
+        int                   stride1,
+        int                   pad0,
+        int                   pad1,
+        int                   dilation0,
+        int                   dilation1) {
+    GGML_ASSERT(a->ne[2] == 1);
+    GGML_ASSERT(a->ne[3] == b->ne[2]);
+    int64_t ne[4];
+    ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0);
+    ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1);
+    ne[2] = b->ne[2];
+    ne[3] = b->ne[3];
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
+
+    if (ggml_is_contiguous_channels(b)) {
+        // Result will be permuted the same way as input (CWHN order)
+        const int64_t type_size = ggml_type_size(result->type);
+        GGML_ASSERT(ggml_blck_size(result->type) == 1);
+        result->nb[0] = result->ne[2] * type_size;
+        result->nb[1] = result->ne[0] * result->nb[0];
+        result->nb[2] = type_size;
+    }
+
+    int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op = GGML_OP_CONV_2D_DW;
+    result->src[0] = a;
+    result->src[1] = b;
+    return result;
+}
+
 // ggml_conv_transpose_2d_p0
 
 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
```
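
Two notes on this file. `ggml_calc_conv_output_size` is the pre-existing helper shared with `ggml_conv_2d`; it computes the standard convolution output size `(ins + 2*pad - dilation*(ks - 1) - 1) / stride + 1`, e.g. input 64, kernel 3, stride 1, pad 1, dilation 1 gives `(64 + 2 - 2 - 1) / 1 + 1 = 64` (shape-preserving "same" padding). And because `ggml_conv_2d_dw_direct` rewrites `nb[]` when the input is channel-packed, a CWHN input yields a CWHN output. A sketch of how a caller might build such an input, assuming `ggml_permute`'s arguments give the new position of each source dimension:

```c
// Allocate contiguously as C x W x H x N, then permute the logical axes to
// W x H x C x N (C -> dim 2, W -> dim 0, H -> dim 1, N -> dim 3).
// The result keeps nb[2] == element size, so ggml_is_contiguous_channels()
// returns true and the vectorized CWHN kernel path is taken.
static struct ggml_tensor * make_cwhn_input(struct ggml_context * ctx,
                                            int64_t C, int64_t W, int64_t H, int64_t N) {
    struct ggml_tensor * t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, C, W, H, N);
    return ggml_permute(ctx, t, 2, 0, 1, 3); // logical shape: W, H, C, N
}
```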