Commit efa6dac
R0CKSTAR committed
Parent(s): 09dd86a

CUDA: Fix clang warnings (llama/12540)

Signed-off-by: Xiaodong Ye <[email protected]>
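
The warnings fixed here come from mixing '&&' and '||' without explicit grouping, which clang reports as -Wlogical-op-parentheses (part of -Wparentheses, enabled by -Wall). A minimal sketch of the warning class, with illustrative names not taken from the commit:

    // clang++ -Wall -c example.cpp
    // warning: '&&' within '||' [-Wlogical-op-parentheses]
    static bool before(bool a, bool b, bool c) {
        return a && b || c;    // parses as (a && b) || c, but clang wants it spelled out
    }

    // Parenthesized as in this commit: same behavior, no warning.
    static bool after(bool a, bool b, bool c) {
        return (a && b) || c;
    }

    int main() {
        return (before(true, true, false) == after(true, true, false)) ? 0 : 1;
    }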
ggml/src/ggml-cuda/common.cuh CHANGED

@@ -243,14 +243,14 @@ static bool fp16_mma_available(const int cc) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
     return false;
 #else
-    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ||
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
         GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }

 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fp16_mma_hardware_available(const int cc) {
-    return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA ||
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
         GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
 }
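
Because '&&' binds tighter than '||' in C and C++, the added parentheses do not change what either function returns; they only silence the diagnostic. A standalone compile-time check of that claim (a sketch, not part of the commit):

    // Note: the left-hand spelling below is itself what -Wall warns about,
    // which is exactly why the commit writes out the grouping explicitly.
    static_assert((true && false || true) == ((true && false) || true),
                  "'a && b || c' already parses as '(a && b) || c'");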
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED

@@ -1192,7 +1192,7 @@ static void ggml_cuda_op_mul_mat_cublas(

     const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;

-    if (((cc >= GGML_CUDA_CC_VOLTA
+    if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
         if (src0->type != GGML_TYPE_F16) {
ggml/src/ggml-cuda/mmq.cu CHANGED

@@ -27,8 +27,8 @@ void ggml_cuda_op_mul_mat_q(
     // The stream-k decomposition is only faster for recent NVIDIA GPUs.
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k =
-
+    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
+        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
     const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};

     switch (src0->type) {
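
The comment in this hunk carries the real constraint: the stream-k fixup allocates one temporary buffer from the memory pool, and when src1_ncols != ne11 several CUDA streams run in parallel, so sharing that buffer would race. A self-contained CUDA sketch of that failure mode, with hypothetical kernel and buffer names (this is not ggml code):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void touch(int * buf, int val) {
        buf[threadIdx.x] = val;   // both streams write the same locations
    }

    int main() {
        int * scratch;            // stand-in for the shared fixup buffer
        cudaMalloc(&scratch, 32 * sizeof(int));

        cudaStream_t s0, s1;
        cudaStreamCreate(&s0);
        cudaStreamCreate(&s1);

        // No ordering between the two streams: the final contents depend on scheduling.
        touch<<<1, 32, 0, s0>>>(scratch, 0);
        touch<<<1, 32, 0, s1>>>(scratch, 1);
        cudaDeviceSynchronize();

        int host[32];
        cudaMemcpy(host, scratch, sizeof(host), cudaMemcpyDeviceToHost);
        printf("scratch[0] = %d (0 or 1, depending on stream order)\n", host[0]);

        cudaStreamDestroy(s0);
        cudaStreamDestroy(s1);
        cudaFree(scratch);
        return 0;
    }

Gating use_stream_k on src1_ncols == ne11 sidesteps this by enabling the decomposition only when a single stream owns the buffer.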
ggml/src/ggml-cuda/mmq.cuh CHANGED

@@ -90,7 +90,7 @@ struct tile_x_sizes {

 static int get_mmq_x_max_host(const int cc) {
     return new_mma_available(cc) ? 128 :
-
+        GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
 #ifdef GGML_CUDA_FORCE_MMQ
         128 : 64;
 #else

@@ -124,7 +124,7 @@ static constexpr __device__ int get_mmq_x_max_device() {

 static int get_mmq_y_host(const int cc) {
     return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
-        ((
+        ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64);
 }

 static constexpr __device__ int get_mmq_y_device() {

@@ -2832,7 +2832,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
     const int mmq_x_max = get_mmq_x_max_host(cc);
     const int mmq_y = get_mmq_y_host(cc);
     const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
-    const bool use_stream_k =
+    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;

     int mmq_x_best = 0;
     int nparts_best = INT_MAX;
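
The tile-size selection that get_mmq_y_host now performs can be checked in isolation. A hedged, self-contained sketch with the GGML_CUDA_CC_* macros replaced by plain booleans so it compiles on its own:

    #include <cassert>

    static int mmq_y_for(bool is_amd, bool is_rdna1, bool is_nvidia, bool arch_ge_volta) {
        return is_amd ? (is_rdna1 ? 64 : 128)
                      : ((is_nvidia && arch_ge_volta) ? 128 : 64);
    }

    int main() {
        assert(mmq_y_for(true,  true,  false, false) ==  64);   // RDNA1
        assert(mmq_y_for(true,  false, false, false) == 128);   // other AMD
        assert(mmq_y_for(false, false, true,  true ) == 128);   // NVIDIA, arch >= Volta
        assert(mmq_y_for(false, false, false, false) ==  64);   // everything else
        return 0;
    }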