uvos committed
Commit ed08269 · 1 Parent(s): 4850c24

CUDA/HIP: add support for selectable warp size to mmv (llama/11519)

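AMD GCN and CDNA GPUs execute 64-lane wavefronts, while NVIDIA GPUs (and AMD RDNA, which compiles for wave32 by default) use 32-lane warps. This change stops hard-coding WARP_SIZE == 32 in the mul_mat_vec path: device code now picks the physical warp size at compile time through a new constexpr helper, and the host-side launcher queries the warp size of the active device at runtime.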
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -176,6 +176,14 @@ static constexpr bool new_mma_available(const int cc) {
     return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
 }
 
+static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+    return __AMDGCN_WAVEFRONT_SIZE;
+#else
+    return 32;
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+}
+
 [[noreturn]]
 static __device__ void no_device_code(
     const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
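On AMD, __AMDGCN_WAVEFRONT_SIZE is a compiler-provided macro that expands to 32 or 64 depending on the wavefront size the translation unit is built for; everywhere else the helper returns the fixed CUDA warp size of 32. Because the helper is constexpr, its result can feed compile-time constructs. A hypothetical usage sketch (example_kernel is made up for illustration, not part of this commit):

    // Sketch: a constexpr warp size can size static shared memory and
    // satisfy static_asserts with no runtime cost.
    template <int block_size>
    static __global__ void example_kernel(float * dst) {
        constexpr int warp_size = ggml_cuda_get_physical_warp_size();
        static_assert(block_size % warp_size == 0, "block must be a whole number of warps");
        __shared__ float buf[block_size/warp_size]; // one staging slot per warp
        // ... each warp would write its partial result into buf ...
    }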
ggml/src/ggml-cuda/mmv.cu CHANGED
@@ -5,9 +5,10 @@ template <typename T, typename type_acc, int block_size>
 static __global__ void mul_mat_vec(
         const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
         const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
-    const int64_t row     = blockIdx.x;
-    const int64_t channel = blockIdx.z;
-    const int     tid     = threadIdx.x;
+    const int64_t row       = blockIdx.x;
+    const int64_t channel   = blockIdx.z;
+    const int     tid       = threadIdx.x;
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
 
     x   += (channel/channel_ratio)*stride_channel_x + row*stride_row;
     y   += channel               *stride_channel_y;
@@ -18,8 +19,8 @@ static __global__ void mul_mat_vec(
     extern __shared__ char data_mmv[];
     float * buf_iw = (float *) data_mmv;
 
-    if (block_size > WARP_SIZE) {
-        if (tid < WARP_SIZE) {
+    if (block_size > warp_size) {
+        if (tid < warp_size) {
             buf_iw[tid] = 0.0f;
         }
         __syncthreads();
@@ -67,16 +68,16 @@ static __global__ void mul_mat_vec(
         static_assert(std::is_same<T, void>::value, "unsupported type");
     }
 
-    sumf = warp_reduce_sum(sumf);
+    sumf = warp_reduce_sum<warp_size>(sumf);
 
-    if (block_size > WARP_SIZE) {
-        buf_iw[tid/WARP_SIZE] = sumf;
+    if (block_size > warp_size) {
+        buf_iw[tid/warp_size] = sumf;
         __syncthreads();
-        if (tid >= WARP_SIZE) {
+        if (tid >= warp_size) {
             return;
         }
         sumf = buf_iw[tid];
-        sumf = warp_reduce_sum(sumf);
+        sumf = warp_reduce_sum<warp_size>(sumf);
     }
 
     if (tid != 0) {
@@ -96,10 +97,19 @@ static void launch_mul_mat_vec_cuda(
     GGML_ASSERT(stride_row % 2 == 0);
     GGML_ASSERT(nchannels_y % nchannels_x == 0);
     const int64_t channel_ratio = nchannels_y / nchannels_x;
+    int device;
+    int warp_size;
 
-    int64_t block_size_best = WARP_SIZE;
-    int64_t niter_best      = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
-    for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
+    CUDA_CHECK(cudaGetDevice(&device));
+    warp_size = ggml_cuda_info().devices[device].warp_size;
+
+    int64_t block_size_best = warp_size;
+    int64_t niter_best      = (ncols + 2*warp_size - 1) / (2*warp_size);
+    int64_t max_block_size  = 256;
+    if(ggml_cuda_info().devices[device].cc > GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_info().devices[device].cc < GGML_CUDA_CC_RDNA1) {
+        max_block_size = 128;
+    }
+    for (int64_t block_size = 2*warp_size; block_size <= max_block_size; block_size += warp_size) {
         const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
         if (niter < niter_best) {
             niter_best      = niter;
@@ -107,7 +117,7 @@ static void launch_mul_mat_vec_cuda(
         }
     }
 
-    const int smem = WARP_SIZE*sizeof(float);
+    const int smem = warp_size*sizeof(float);
     const dim3 block_nums(nrows, 1, nchannels_y);
     const dim3 block_dims(block_size_best, 1, 1);
     switch (block_size_best) {
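On the device side, the functional change is that warp_reduce_sum now takes the warp size as an explicit template argument instead of defaulting to WARP_SIZE. The real implementation lives in ggml-cuda/common.cuh; the sketch below shows the standard shuffle-based butterfly pattern such a width-templated reduction follows, assuming CUDA compilation (a sketch, not the verbatim ggml code):

    // Butterfly reduction: after log2(width) xor-shuffle steps, every lane
    // holds the sum of all width lanes.
    template <int width>
    static __device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
    #pragma unroll
        for (int offset = width/2; offset > 0; offset >>= 1) {
            // 0xffffffff is the full 32-bit lane mask on CUDA; HIP uses a
            // wider mask type for 64-lane wavefronts.
            x += __shfl_xor_sync(0xffffffff, x, offset, width);
        }
        return x;
    }

On the host side, launch_mul_mat_vec_cuda reads the warp size of the active device from ggml_cuda_info() rather than assuming WARP_SIZE, and additionally caps the candidate block size at 128 threads on AMD GPUs older than RDNA1.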
ggml/src/ggml-cuda/vendors/hip.h CHANGED
@@ -1,5 +1,6 @@
 #pragma once
 
+#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
@@ -8,6 +9,7 @@
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
+
 #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
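HIP_ENABLE_WARP_SYNC_BUILTINS must be defined before the first hip_runtime.h include: it makes HIP expose the masked *_sync warp builtins (__shfl_sync, __shfl_xor_sync, and the other warp-sync intrinsics), letting the shared CUDA/HIP kernels call the same shuffle API on both vendors.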