R0CKSTAR committed on
Commit
efa6dac
·
1 Parent(s): 09dd86a

CUDA: Fix clang warnings (llama/12540)

Browse files

Signed-off-by: Xiaodong Ye <[email protected]>

ggml/src/ggml-cuda/common.cuh CHANGED
@@ -243,14 +243,14 @@ static bool fp16_mma_available(const int cc) {
243
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
244
  return false;
245
  #else
246
- return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ||
247
  GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
248
  #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
249
  }
250
 
251
  // To be used for feature selection of external libraries, e.g. cuBLAS.
252
  static bool fp16_mma_hardware_available(const int cc) {
253
- return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA ||
254
  GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
255
  }
256
 
 
243
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
244
  return false;
245
  #else
246
+ return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
247
  GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
248
  #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
249
  }
250
 
251
  // To be used for feature selection of external libraries, e.g. cuBLAS.
252
  static bool fp16_mma_hardware_available(const int cc) {
253
+ return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
254
  GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
255
  }
256
 
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -1192,7 +1192,7 @@ static void ggml_cuda_op_mul_mat_cublas(
1192
 
1193
  const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
1194
 
1195
- if (((cc >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc)) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
1196
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
1197
  ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
1198
  if (src0->type != GGML_TYPE_F16) {
 
1192
 
1193
  const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
1194
 
1195
+ if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
1196
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
1197
  ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
1198
  if (src0->type != GGML_TYPE_F16) {
ggml/src/ggml-cuda/mmq.cu CHANGED
@@ -27,8 +27,8 @@ void ggml_cuda_op_mul_mat_q(
27
  // The stream-k decomposition is only faster for recent NVIDIA GPUs.
28
  // Also its fixup needs to allocate a temporary buffer in the memory pool.
29
  // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
30
- const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA &&
31
- GGML_CUDA_CC_IS_NVIDIA(cc) && src1_ncols == ne11;
32
  const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
33
 
34
  switch (src0->type) {
 
27
  // The stream-k decomposition is only faster for recent NVIDIA GPUs.
28
  // Also its fixup needs to allocate a temporary buffer in the memory pool.
29
  // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
30
+ const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
31
+ ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
32
  const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
33
 
34
  switch (src0->type) {
ggml/src/ggml-cuda/mmq.cuh CHANGED
@@ -90,7 +90,7 @@ struct tile_x_sizes {
90
 
91
  static int get_mmq_x_max_host(const int cc) {
92
  return new_mma_available(cc) ? 128 :
93
- ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc) ?
94
  #ifdef GGML_CUDA_FORCE_MMQ
95
  128 : 64;
96
  #else
@@ -124,7 +124,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
124
 
125
  static int get_mmq_y_host(const int cc) {
126
  return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
127
- ((ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc)) ? 128 : 64);
128
  }
129
 
130
  static constexpr __device__ int get_mmq_y_device() {
@@ -2832,7 +2832,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
2832
  const int mmq_x_max = get_mmq_x_max_host(cc);
2833
  const int mmq_y = get_mmq_y_host(cc);
2834
  const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
2835
- const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc);
2836
 
2837
  int mmq_x_best = 0;
2838
  int nparts_best = INT_MAX;
 
90
 
91
  static int get_mmq_x_max_host(const int cc) {
92
  return new_mma_available(cc) ? 128 :
93
+ GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
94
  #ifdef GGML_CUDA_FORCE_MMQ
95
  128 : 64;
96
  #else
 
124
 
125
  static int get_mmq_y_host(const int cc) {
126
  return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
127
+ ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64);
128
  }
129
 
130
  static constexpr __device__ int get_mmq_y_device() {
 
2832
  const int mmq_x_max = get_mmq_x_max_host(cc);
2833
  const int mmq_y = get_mmq_y_host(cc);
2834
  const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
2835
+ const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
2836
 
2837
  int mmq_x_best = 0;
2838
  int nparts_best = INT_MAX;