Spaces:
Sleeping
Sleeping
CUDA: add mean operation (llama/14313)
Browse files

* CUDA: add mean operation
* add back sum_rows_f32_cuda
* Review: early exit if col!=0
- ggml/src/ggml-cuda/common.cuh +20 -0
- ggml/src/ggml-cuda/ggml-cuda.cu +5 -0
- ggml/src/ggml-cuda/mean.cu +19 -0
- ggml/src/ggml-cuda/mean.cuh +3 -0
- ggml/src/ggml-cuda/sumrows.cu +5 -18
- ggml/src/ggml-cuda/sumrows.cuh +0 -1
ggml/src/ggml-cuda/common.cuh
CHANGED
|
@@ -362,6 +362,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
|
| 362 |
#endif // FP16_AVAILABLE
|
| 363 |
}
|
| 364 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
template<int width = WARP_SIZE>
|
| 366 |
static __device__ __forceinline__ float warp_reduce_max(float x) {
|
| 367 |
#pragma unroll
|
|
|
|
| 362 |
#endif // FP16_AVAILABLE
|
| 363 |
}
|
| 364 |
|
| 365 |
+
// Row reduction kernel template - computes, per row, either the plain sum
// (norm == false) or the arithmetic mean (norm == true) of ncols contiguous
// floats. Launch configuration: one block per row; blockDim.x is expected to
// be a single warp (callers in this backend launch with WARP_SIZE), since the
// per-lane partials are combined with a warp-level reduction only.
template<bool norm>
static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.x;
    const int col = threadIdx.x;

    // Each lane accumulates a strided slice of the row (coalesced accesses:
    // adjacent lanes read adjacent elements).
    float partial = 0.0f;
    for (int i = col; i < ncols; i += blockDim.x) {
        partial += x[row * ncols + i];
    }

    // Combine the per-lane partials; lane 0 ends up holding the full row sum.
    partial = warp_reduce_sum(partial);

    // Early exit for all lanes except lane 0 (review: early exit if col != 0).
    if (col != 0) {
        return;
    }

    dst[row] = norm ? partial / ncols : partial;
}
|
| 384 |
+
|
| 385 |
template<int width = WARP_SIZE>
|
| 386 |
static __device__ __forceinline__ float warp_reduce_max(float x) {
|
| 387 |
#pragma unroll
|
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
|
@@ -37,6 +37,7 @@
|
|
| 37 |
#include "ggml-cuda/ssm-scan.cuh"
|
| 38 |
#include "ggml-cuda/sum.cuh"
|
| 39 |
#include "ggml-cuda/sumrows.cuh"
|
|
|
|
| 40 |
#include "ggml-cuda/tsembd.cuh"
|
| 41 |
#include "ggml-cuda/unary.cuh"
|
| 42 |
#include "ggml-cuda/upscale.cuh"
|
|
@@ -2357,6 +2358,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|
| 2357 |
case GGML_OP_SUM_ROWS:
|
| 2358 |
ggml_cuda_op_sum_rows(ctx, dst);
|
| 2359 |
break;
|
|
|
|
|
|
|
|
|
|
| 2360 |
case GGML_OP_SSM_CONV:
|
| 2361 |
ggml_cuda_op_ssm_conv(ctx, dst);
|
| 2362 |
break;
|
|
@@ -3260,6 +3264,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|
| 3260 |
case GGML_OP_POOL_2D:
|
| 3261 |
case GGML_OP_SUM:
|
| 3262 |
case GGML_OP_SUM_ROWS:
|
|
|
|
| 3263 |
case GGML_OP_ARGSORT:
|
| 3264 |
case GGML_OP_ACC:
|
| 3265 |
return true;
|
|
|
|
| 37 |
#include "ggml-cuda/ssm-scan.cuh"
|
| 38 |
#include "ggml-cuda/sum.cuh"
|
| 39 |
#include "ggml-cuda/sumrows.cuh"
|
| 40 |
+
#include "ggml-cuda/mean.cuh"
|
| 41 |
#include "ggml-cuda/tsembd.cuh"
|
| 42 |
#include "ggml-cuda/unary.cuh"
|
| 43 |
#include "ggml-cuda/upscale.cuh"
|
|
|
|
| 2358 |
case GGML_OP_SUM_ROWS:
|
| 2359 |
ggml_cuda_op_sum_rows(ctx, dst);
|
| 2360 |
break;
|
| 2361 |
+
case GGML_OP_MEAN:
|
| 2362 |
+
ggml_cuda_op_mean(ctx, dst);
|
| 2363 |
+
break;
|
| 2364 |
case GGML_OP_SSM_CONV:
|
| 2365 |
ggml_cuda_op_ssm_conv(ctx, dst);
|
| 2366 |
break;
|
|
|
|
| 3264 |
case GGML_OP_POOL_2D:
|
| 3265 |
case GGML_OP_SUM:
|
| 3266 |
case GGML_OP_SUM_ROWS:
|
| 3267 |
+
case GGML_OP_MEAN:
|
| 3268 |
case GGML_OP_ARGSORT:
|
| 3269 |
case GGML_OP_ACC:
|
| 3270 |
return true;
|
ggml/src/ggml-cuda/mean.cu
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "mean.cuh"

// CUDA backend implementation of GGML_OP_MEAN: for every row of src0
// (a contiguous f32 tensor), writes the arithmetic mean of that row into dst.
void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *) src0->data;
    float * dst_d = (float *) dst->data;
    cudaStream_t stream = ctx.stream();

    // Only contiguous f32 -> f32 is supported by the reduction kernel.
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    // One warp-sized block per row; norm=true makes the kernel divide the
    // row sum by ncols to produce the mean.
    const dim3 block_dims(WARP_SIZE, 1, 1);
    const dim3 block_nums(nrows, 1, 1);
    reduce_rows_f32<true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
}
|
ggml/src/ggml-cuda/mean.cuh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "common.cuh"

// ggml backend entry point for GGML_OP_MEAN: row-wise mean of a contiguous
// f32 tensor (see mean.cu).
void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
ggml/src/ggml-cuda/sumrows.cu
CHANGED
|
@@ -1,25 +1,9 @@
|
|
| 1 |
#include "sumrows.cuh"
|
| 2 |
|
| 3 |
-
static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
|
| 4 |
-
const int row = blockIdx.x;
|
| 5 |
-
const int col = threadIdx.x;
|
| 6 |
-
|
| 7 |
-
float sum = 0.0f;
|
| 8 |
-
for (int i = col; i < ncols; i += blockDim.x) {
|
| 9 |
-
sum += x[row * ncols + i];
|
| 10 |
-
}
|
| 11 |
-
|
| 12 |
-
sum = warp_reduce_sum(sum);
|
| 13 |
-
|
| 14 |
-
if (col == 0) {
|
| 15 |
-
dst[row] = sum;
|
| 16 |
-
}
|
| 17 |
-
}
|
| 18 |
-
|
| 19 |
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 20 |
const dim3 block_dims(WARP_SIZE, 1, 1);
|
| 21 |
const dim3 block_nums(nrows, 1, 1);
|
| 22 |
-
|
| 23 |
}
|
| 24 |
|
| 25 |
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
@@ -35,5 +19,8 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
| 35 |
const int64_t ncols = src0->ne[0];
|
| 36 |
const int64_t nrows = ggml_nrows(src0);
|
| 37 |
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
| 39 |
}
|
|
|
|
| 1 |
#include "sumrows.cuh"
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
// Launches a row-wise sum over an [nrows x ncols] contiguous f32 matrix on
// the given stream: one warp-sized block per row, no normalization.
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    const dim3 threads(WARP_SIZE, 1, 1);
    const dim3 blocks(nrows, 1, 1);
    reduce_rows_f32<false><<<blocks, threads, 0, stream>>>(x, dst, ncols);
}
|
| 8 |
|
| 9 |
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
|
|
| 19 |
const int64_t ncols = src0->ne[0];
|
| 20 |
const int64_t nrows = ggml_nrows(src0);
|
| 21 |
|
| 22 |
+
const dim3 block_dims(WARP_SIZE, 1, 1);
|
| 23 |
+
const dim3 block_nums(nrows, 1, 1);
|
| 24 |
+
|
| 25 |
+
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
| 26 |
}
|
ggml/src/ggml-cuda/sumrows.cuh
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
#include "common.cuh"
|
| 2 |
|
| 3 |
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
|
| 4 |
-
|
| 5 |
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
|
|
| 1 |
#include "common.cuh"

// Row-wise f32 sum: for each of nrows rows, dst[row] receives the sum of the
// ncols contiguous elements x[row * ncols .. row * ncols + ncols - 1].
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);

// ggml backend entry point for GGML_OP_SUM_ROWS (see sumrows.cu).
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|