am17an commited on
Commit
7cee55b
·
1 Parent(s): 2c3741a

CUDA: add mean operation (llama/14313)

Browse files

* CUDA: add mean operation

* add back sum_rows_f32_cuda

* Review: early exit if col!=0

ggml/src/ggml-cuda/common.cuh CHANGED
@@ -362,6 +362,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
362
  #endif // FP16_AVAILABLE
363
  }
364
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  template<int width = WARP_SIZE>
366
  static __device__ __forceinline__ float warp_reduce_max(float x) {
367
  #pragma unroll
 
362
  #endif // FP16_AVAILABLE
363
  }
364
 
365
+ // Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
366
+ template<bool norm>
367
+ static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
368
+ const int row = blockIdx.x;
369
+ const int col = threadIdx.x;
370
+
371
+ float sum = 0.0f;
372
+ for (int i = col; i < ncols; i += blockDim.x) {
373
+ sum += x[row * ncols + i];
374
+ }
375
+
376
+ sum = warp_reduce_sum(sum);
377
+
378
+ if (col != 0) {
379
+ return;
380
+ }
381
+
382
+ dst[row] = norm ? sum / ncols : sum;
383
+ }
384
+
385
  template<int width = WARP_SIZE>
386
  static __device__ __forceinline__ float warp_reduce_max(float x) {
387
  #pragma unroll
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -37,6 +37,7 @@
37
  #include "ggml-cuda/ssm-scan.cuh"
38
  #include "ggml-cuda/sum.cuh"
39
  #include "ggml-cuda/sumrows.cuh"
 
40
  #include "ggml-cuda/tsembd.cuh"
41
  #include "ggml-cuda/unary.cuh"
42
  #include "ggml-cuda/upscale.cuh"
@@ -2357,6 +2358,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
2357
  case GGML_OP_SUM_ROWS:
2358
  ggml_cuda_op_sum_rows(ctx, dst);
2359
  break;
 
 
 
2360
  case GGML_OP_SSM_CONV:
2361
  ggml_cuda_op_ssm_conv(ctx, dst);
2362
  break;
@@ -3260,6 +3264,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
3260
  case GGML_OP_POOL_2D:
3261
  case GGML_OP_SUM:
3262
  case GGML_OP_SUM_ROWS:
 
3263
  case GGML_OP_ARGSORT:
3264
  case GGML_OP_ACC:
3265
  return true;
 
37
  #include "ggml-cuda/ssm-scan.cuh"
38
  #include "ggml-cuda/sum.cuh"
39
  #include "ggml-cuda/sumrows.cuh"
40
+ #include "ggml-cuda/mean.cuh"
41
  #include "ggml-cuda/tsembd.cuh"
42
  #include "ggml-cuda/unary.cuh"
43
  #include "ggml-cuda/upscale.cuh"
 
2358
  case GGML_OP_SUM_ROWS:
2359
  ggml_cuda_op_sum_rows(ctx, dst);
2360
  break;
2361
+ case GGML_OP_MEAN:
2362
+ ggml_cuda_op_mean(ctx, dst);
2363
+ break;
2364
  case GGML_OP_SSM_CONV:
2365
  ggml_cuda_op_ssm_conv(ctx, dst);
2366
  break;
 
3264
  case GGML_OP_POOL_2D:
3265
  case GGML_OP_SUM:
3266
  case GGML_OP_SUM_ROWS:
3267
+ case GGML_OP_MEAN:
3268
  case GGML_OP_ARGSORT:
3269
  case GGML_OP_ACC:
3270
  return true;
ggml/src/ggml-cuda/mean.cu ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "mean.cuh"
2
+
3
+ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
4
+ const ggml_tensor * src0 = dst->src[0];
5
+ const float * src0_d = (const float *) src0->data;
6
+ float * dst_d = (float *) dst->data;
7
+ cudaStream_t stream = ctx.stream();
8
+
9
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
10
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
11
+ GGML_ASSERT(ggml_is_contiguous(src0));
12
+
13
+ const int64_t ncols = src0->ne[0];
14
+ const int64_t nrows = ggml_nrows(src0);
15
+
16
+ const dim3 block_dims(WARP_SIZE, 1, 1);
17
+ const dim3 block_nums(nrows, 1, 1);
18
+ reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
19
+ }
ggml/src/ggml-cuda/mean.cuh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ #include "common.cuh"
2
+
3
+ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml/src/ggml-cuda/sumrows.cu CHANGED
@@ -1,25 +1,9 @@
1
  #include "sumrows.cuh"
2
 
3
- static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
4
- const int row = blockIdx.x;
5
- const int col = threadIdx.x;
6
-
7
- float sum = 0.0f;
8
- for (int i = col; i < ncols; i += blockDim.x) {
9
- sum += x[row * ncols + i];
10
- }
11
-
12
- sum = warp_reduce_sum(sum);
13
-
14
- if (col == 0) {
15
- dst[row] = sum;
16
- }
17
- }
18
-
19
  void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
20
  const dim3 block_dims(WARP_SIZE, 1, 1);
21
  const dim3 block_nums(nrows, 1, 1);
22
- k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
23
  }
24
 
25
  void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -35,5 +19,8 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
35
  const int64_t ncols = src0->ne[0];
36
  const int64_t nrows = ggml_nrows(src0);
37
 
38
- sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
 
 
 
39
  }
 
1
  #include "sumrows.cuh"
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4
  const dim3 block_dims(WARP_SIZE, 1, 1);
5
  const dim3 block_nums(nrows, 1, 1);
6
+ reduce_rows_f32</*norm*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
7
  }
8
 
9
  void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
19
  const int64_t ncols = src0->ne[0];
20
  const int64_t nrows = ggml_nrows(src0);
21
 
22
+ const dim3 block_dims(WARP_SIZE, 1, 1);
23
+ const dim3 block_nums(nrows, 1, 1);
24
+
25
+ reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
26
  }
ggml/src/ggml-cuda/sumrows.cuh CHANGED
@@ -1,5 +1,4 @@
1
  #include "common.cuh"
2
 
3
  void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
4
-
5
  void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
1
  #include "common.cuh"
2
 
3
  void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
 
4
  void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);