Spaces:
Running
Running
uvos
commited on
Commit
·
ed08269
1
Parent(s):
4850c24
CUDA/HIP: add support for selectable warp size to mmv (llama/11519)
Browse files- ggml/src/ggml-cuda/common.cuh +8 -0
- ggml/src/ggml-cuda/mmv.cu +24 -14
- ggml/src/ggml-cuda/vendors/hip.h +2 -0
ggml/src/ggml-cuda/common.cuh
CHANGED
|
@@ -176,6 +176,14 @@ static constexpr bool new_mma_available(const int cc) {
|
|
| 176 |
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
|
| 177 |
}
|
| 178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
[[noreturn]]
|
| 180 |
static __device__ void no_device_code(
|
| 181 |
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
|
|
|
| 176 |
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
|
| 177 |
}
|
| 178 |
|
| 179 |
+
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
| 180 |
+
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
| 181 |
+
return __AMDGCN_WAVEFRONT_SIZE;
|
| 182 |
+
#else
|
| 183 |
+
return 32;
|
| 184 |
+
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
[[noreturn]]
|
| 188 |
static __device__ void no_device_code(
|
| 189 |
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
ggml/src/ggml-cuda/mmv.cu
CHANGED
|
@@ -5,9 +5,10 @@ template <typename T, typename type_acc, int block_size>
|
|
| 5 |
static __global__ void mul_mat_vec(
|
| 6 |
const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
|
| 7 |
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
|
| 8 |
-
const int64_t row
|
| 9 |
-
const int64_t channel
|
| 10 |
-
const int tid
|
|
|
|
| 11 |
|
| 12 |
x += (channel/channel_ratio)*stride_channel_x + row*stride_row;
|
| 13 |
y += channel *stride_channel_y;
|
|
@@ -18,8 +19,8 @@ static __global__ void mul_mat_vec(
|
|
| 18 |
extern __shared__ char data_mmv[];
|
| 19 |
float * buf_iw = (float *) data_mmv;
|
| 20 |
|
| 21 |
-
if (block_size >
|
| 22 |
-
if (tid <
|
| 23 |
buf_iw[tid] = 0.0f;
|
| 24 |
}
|
| 25 |
__syncthreads();
|
|
@@ -67,16 +68,16 @@ static __global__ void mul_mat_vec(
|
|
| 67 |
static_assert(std::is_same<T, void>::value, "unsupported type");
|
| 68 |
}
|
| 69 |
|
| 70 |
-
sumf = warp_reduce_sum(sumf);
|
| 71 |
|
| 72 |
-
if (block_size >
|
| 73 |
-
buf_iw[tid/
|
| 74 |
__syncthreads();
|
| 75 |
-
if (tid >=
|
| 76 |
return;
|
| 77 |
}
|
| 78 |
sumf = buf_iw[tid];
|
| 79 |
-
sumf = warp_reduce_sum(sumf);
|
| 80 |
}
|
| 81 |
|
| 82 |
if (tid != 0) {
|
|
@@ -96,10 +97,19 @@ static void launch_mul_mat_vec_cuda(
|
|
| 96 |
GGML_ASSERT(stride_row % 2 == 0);
|
| 97 |
GGML_ASSERT(nchannels_y % nchannels_x == 0);
|
| 98 |
const int64_t channel_ratio = nchannels_y / nchannels_x;
|
|
|
|
|
|
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
|
| 104 |
if (niter < niter_best) {
|
| 105 |
niter_best = niter;
|
|
@@ -107,7 +117,7 @@ static void launch_mul_mat_vec_cuda(
|
|
| 107 |
}
|
| 108 |
}
|
| 109 |
|
| 110 |
-
const int smem =
|
| 111 |
const dim3 block_nums(nrows, 1, nchannels_y);
|
| 112 |
const dim3 block_dims(block_size_best, 1, 1);
|
| 113 |
switch (block_size_best) {
|
|
|
|
| 5 |
static __global__ void mul_mat_vec(
|
| 6 |
const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
|
| 7 |
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
|
| 8 |
+
const int64_t row = blockIdx.x;
|
| 9 |
+
const int64_t channel = blockIdx.z;
|
| 10 |
+
const int tid = threadIdx.x;
|
| 11 |
+
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
| 12 |
|
| 13 |
x += (channel/channel_ratio)*stride_channel_x + row*stride_row;
|
| 14 |
y += channel *stride_channel_y;
|
|
|
|
| 19 |
extern __shared__ char data_mmv[];
|
| 20 |
float * buf_iw = (float *) data_mmv;
|
| 21 |
|
| 22 |
+
if (block_size > warp_size) {
|
| 23 |
+
if (tid < warp_size) {
|
| 24 |
buf_iw[tid] = 0.0f;
|
| 25 |
}
|
| 26 |
__syncthreads();
|
|
|
|
| 68 |
static_assert(std::is_same<T, void>::value, "unsupported type");
|
| 69 |
}
|
| 70 |
|
| 71 |
+
sumf = warp_reduce_sum<warp_size>(sumf);
|
| 72 |
|
| 73 |
+
if (block_size > warp_size) {
|
| 74 |
+
buf_iw[tid/warp_size] = sumf;
|
| 75 |
__syncthreads();
|
| 76 |
+
if (tid >= warp_size) {
|
| 77 |
return;
|
| 78 |
}
|
| 79 |
sumf = buf_iw[tid];
|
| 80 |
+
sumf = warp_reduce_sum<warp_size>(sumf);
|
| 81 |
}
|
| 82 |
|
| 83 |
if (tid != 0) {
|
|
|
|
| 97 |
GGML_ASSERT(stride_row % 2 == 0);
|
| 98 |
GGML_ASSERT(nchannels_y % nchannels_x == 0);
|
| 99 |
const int64_t channel_ratio = nchannels_y / nchannels_x;
|
| 100 |
+
int device;
|
| 101 |
+
int warp_size;
|
| 102 |
|
| 103 |
+
CUDA_CHECK(cudaGetDevice(&device));
|
| 104 |
+
warp_size = ggml_cuda_info().devices[device].warp_size;
|
| 105 |
+
|
| 106 |
+
int64_t block_size_best = warp_size;
|
| 107 |
+
int64_t niter_best = (ncols + 2*warp_size - 1) / (2*warp_size);
|
| 108 |
+
int64_t max_block_size = 256;
|
| 109 |
+
if(ggml_cuda_info().devices[device].cc > GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_info().devices[device].cc < GGML_CUDA_CC_RDNA1) {
|
| 110 |
+
max_block_size = 128;
|
| 111 |
+
}
|
| 112 |
+
for (int64_t block_size = 2*warp_size; block_size <= max_block_size; block_size += warp_size) {
|
| 113 |
const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
|
| 114 |
if (niter < niter_best) {
|
| 115 |
niter_best = niter;
|
|
|
|
| 117 |
}
|
| 118 |
}
|
| 119 |
|
| 120 |
+
const int smem = warp_size*sizeof(float);
|
| 121 |
const dim3 block_nums(nrows, 1, nchannels_y);
|
| 122 |
const dim3 block_dims(block_size_best, 1, 1);
|
| 123 |
switch (block_size_best) {
|
ggml/src/ggml-cuda/vendors/hip.h
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
#pragma once
|
| 2 |
|
|
|
|
| 3 |
#include <hip/hip_runtime.h>
|
| 4 |
#include <hipblas/hipblas.h>
|
| 5 |
#include <hip/hip_fp16.h>
|
|
@@ -8,6 +9,7 @@
|
|
| 8 |
// for rocblas_initialize()
|
| 9 |
#include "rocblas/rocblas.h"
|
| 10 |
#endif // __HIP_PLATFORM_AMD__
|
|
|
|
| 11 |
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
| 12 |
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
| 13 |
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
|
|
|
| 1 |
#pragma once
|
| 2 |
|
| 3 |
+
#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
|
| 4 |
#include <hip/hip_runtime.h>
|
| 5 |
#include <hipblas/hipblas.h>
|
| 6 |
#include <hip/hip_fp16.h>
|
|
|
|
| 9 |
// for rocblas_initialize()
|
| 10 |
#include "rocblas/rocblas.h"
|
| 11 |
#endif // __HIP_PLATFORM_AMD__
|
| 12 |
+
|
| 13 |
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
| 14 |
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
| 15 |
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|