Andreas Kieslinger committed
Commit 8544072 · 1 Parent(s): 2ac53b2

CUDA: rename macros to avoid conflicts with WinAPI (llama/10736)


* Renames the NVIDIA GPU-architecture flags to avoid name clashes with WinAPI (e.g. does CC_PASCAL refer to the GPU architecture or to the WinAPI Pascal compiler flag?).

* Reverts an erroneous rename in the SYCL code.

* Renames GGML_CUDA_MIN_CC_DP4A to GGML_CUDA_CC_DP4A.

* Renames the rest of the compute-capability macros for consistency.
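The rename fixes a macro-hygiene problem: an object-like macro such as CC_PASCAL is textually substituted into every later occurrence of that identifier, including declarations in unrelated Windows headers (the comment removed from sum.cu at the end of this diff documents exactly this clash with the libraries CUB pulls in). Below is a minimal sketch of the failure mode, not code from the commit; `win_sdk_like` is a hypothetical stand-in for those foreign headers.

```cpp
// Minimal sketch of the collision the prefix avoids (not code from the commit).
// `win_sdk_like` stands in for the CUB/WinAPI headers named in the sum.cu comment.
namespace win_sdk_like {
    constexpr int CC_PASCAL = 1;   // an ordinary identifier in a foreign header
}

#define GGML_CUDA_CC_PASCAL 600    // new, prefixed name: no interference
// #define CC_PASCAL 600           // old name: would rewrite the use below into
                                   // `win_sdk_like::600`, a hard compile error

int main() {
    return win_sdk_like::CC_PASCAL + (GGML_CUDA_CC_PASCAL - 600);   // returns 1
}
```

This is also why sum.cu previously had to include <cub/cub.cuh> before anything else: with the unprefixed macros, the only safe option was to let the foreign headers be preprocessed first. The prefixed names make that ordering constraint (and its explanatory comment) unnecessary, which is the final hunk of this commit.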

ggml/src/ggml-common.h CHANGED
@@ -473,7 +473,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
     240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
 GGML_TABLE_END()
 
-//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
 GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
     0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -41,28 +41,28 @@
 #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 #define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
 
-#define CC_PASCAL 600
-#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_VOLTA 700
-#define CC_TURING 750
-#define CC_AMPERE 800
-#define CC_OFFSET_AMD 1000000
+#define GGML_CUDA_CC_PASCAL 600
+#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_VOLTA 700
+#define GGML_CUDA_CC_TURING 750
+#define GGML_CUDA_CC_AMPERE 800
+#define GGML_CUDA_CC_OFFSET_AMD 1000000
 
 // GCN/CNDA, wave size is 64
-#define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
-#define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
-#define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
-#define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
-#define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing
-#define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300
+#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
+#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
+#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
+#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing
+#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300
 
 // RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
-#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000
-#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
-#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
+#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
+#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
 
-#define CC_QY1 210
-#define CC_QY2 220
+#define GGML_CUDA_CC_QY1 210
+#define GGML_CUDA_CC_QY2 220
 
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
@@ -131,36 +131,36 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_F16
 
-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 
 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #define FP16_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define INT8_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 
-#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 #define FLASH_ATTN_AVAILABLE
-#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 
 static constexpr bool fast_fp16_available(const int cc) {
-    return cc >= CC_PASCAL && cc != 610;
+    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
 }
 
 static constexpr bool fp16_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
 }
 
 static constexpr bool int8_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
 }
 
 [[noreturn]]
@@ -187,7 +187,7 @@ static __device__ void no_device_code(
 #endif // __CUDA_ARCH__
 
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
@@ -195,7 +195,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
         x += __shfl_xor_sync(0xffffffff, x, offset, 32);
     }
     return x;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }
 
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
@@ -284,7 +284,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 }
 
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #pragma unroll
     for (int offset = 16; offset > 0; offset >>= 1) {
         x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
@@ -293,7 +293,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 }
 
 #if CUDART_VERSION < CUDART_HMASK
@@ -333,13 +333,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 
 #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
     return __dp4a(a, b, c);
-#else // __CUDA_ARCH__ >= MIN_CC_DP4A
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
     const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
 
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }
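The last hunk above is a good example of what the renamed threshold gates: on devices with a compute capability below GGML_CUDA_CC_DP4A (6.1), ggml_cuda_dp4a falls back to an explicit byte-wise dot product with the same contract as the __dp4a intrinsic. A small host-side sketch of that contract, for illustration only (`dp4a_ref` is not a ggml function, and the byte unpacking assumes a little-endian host):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Reference semantics of __dp4a(a, b, c): treat a and b as four signed bytes,
// multiply pairwise, and add the products to the 32-bit accumulator c.
static int dp4a_ref(int a, int b, int c) {
    int8_t a8[4], b8[4];
    std::memcpy(a8, &a, 4);   // reinterpret each 32-bit int as 4 signed bytes
    std::memcpy(b8, &b, 4);   // (byte order matches the device on little-endian hosts)
    for (int i = 0; i < 4; ++i) {
        c += int(a8[i]) * int(b8[i]);
    }
    return c;
}

int main() {
    const int a = 0x04030201;                 // bytes {1, 2, 3, 4}
    const int b = 0x08070605;                 // bytes {5, 6, 7, 8}
    std::printf("%d\n", dp4a_ref(a, b, 10));  // 5 + 12 + 21 + 32 + 10 = 80
    return 0;
}
```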
ggml/src/ggml-cuda/convert.cu CHANGED
@@ -26,7 +26,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
 
 template <bool need_check>
 static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
-#if __CUDA_ARCH__ >= CC_PASCAL
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
     constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
 
     const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
@@ -64,7 +64,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
     GGML_UNUSED(y);
     GGML_UNUSED(k);
     NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_PASCAL
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 }
 
 template<typename dst_t>
@@ -599,7 +599,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
-            if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) {
+            if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_PASCAL) {
                 return dequantize_block_q8_0_f16_cuda;
             }
             return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
ggml/src/ggml-cuda/fattn.cu CHANGED
@@ -304,7 +304,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
 
     // On AMD the tile kernels perform poorly, use the vec kernel instead:
-    if (cc >= CC_OFFSET_AMD) {
+    if (cc >= GGML_CUDA_CC_OFFSET_AMD) {
         if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
             ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
         } else {
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -177,7 +177,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].smpb = prop.sharedMemPerBlock;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
-        info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+        info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
 #else
         info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
@@ -1081,7 +1081,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
     const int compute_capability = ggml_cuda_info().devices[id].cc;
 
-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+    if (compute_capability >= GGML_CUDA_CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
         if (src0->type != GGML_TYPE_F16) {
@@ -1108,7 +1108,7 @@ static void ggml_cuda_op_mul_mat_cublas(
         const half beta_f16 = 0.0f;
 
         cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
-        if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
+        if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
            cu_compute_type = CUBLAS_COMPUTE_32F;
         }
 
@@ -1612,7 +1612,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
     cudaDataType_t cu_data_type = CUDA_R_16F;
 
-    if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
+    if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
         cu_compute_type = CUBLAS_COMPUTE_32F;
     }
 
@@ -2357,7 +2357,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     std::vector<void *> ggml_cuda_cpy_fn_ptrs;
 
     if (cuda_ctx->cuda_graph->graph == nullptr) {
-        if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
+        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
@@ -3028,7 +3028,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 return true;
             }
             const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
-            return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
+            return cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
         }
         case GGML_OP_CROSS_ENTROPY_LOSS:
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
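The first hunk above is where the encoding behind all of these comparisons is established: NVIDIA devices store cc = 100*major + 10*minor, while AMD devices add GGML_CUDA_CC_OFFSET_AMD on top, so a single integer can be tested against either family of thresholds. A standalone sketch of that scheme, assuming the constants from common.cuh; `make_cc` is an illustrative helper, not a ggml function:

```cpp
#include <cstdio>

#define GGML_CUDA_CC_VOLTA      700
#define GGML_CUDA_CC_OFFSET_AMD 1000000
#define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 1030)

// NVIDIA: 100*major + 10*minor. AMD: the same value shifted by the offset.
static constexpr int make_cc(int major, int minor, bool is_amd) {
    return 100*major + 10*minor + (is_amd ? GGML_CUDA_CC_OFFSET_AMD : 0);
}

int main() {
    constexpr int volta_nv  = make_cc(7, 0, false);  // e.g. V100 -> 700
    constexpr int rdna2_amd = make_cc(10, 3, true);  // e.g. gfx1030 -> offset + 1030
    static_assert(rdna2_amd == GGML_CUDA_CC_RDNA2, "AMD values line up with the macros");

    // The comparison pattern used throughout the diff ("recent NVIDIA GPU"):
    const bool nv_ok  = volta_nv  >= GGML_CUDA_CC_VOLTA && volta_nv  < GGML_CUDA_CC_OFFSET_AMD;
    const bool amd_ok = rdna2_amd >= GGML_CUDA_CC_VOLTA && rdna2_amd < GGML_CUDA_CC_OFFSET_AMD;
    std::printf("NVIDIA Volta: %d, AMD RDNA2: %d\n", nv_ok, amd_ok);  // prints 1, 0
    return 0;
}
```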
ggml/src/ggml-cuda/mma.cuh CHANGED
@@ -171,7 +171,7 @@ struct mma_int_C_I16J8 {
 
     __device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) {
 #ifdef INT8_MMA_AVAILABLE
-#if __CUDA_ARCH__ >= CC_AMPERE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
         asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
             : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
             : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0]));
@@ -183,7 +183,7 @@ struct mma_int_C_I16J8 {
         asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
             : "+r"(x[2]), "+r"(x[3])
             : "r"(mma_A.x[1]), "r"(mma_B.x[0]));
-#endif // __CUDA_ARCH__ >= CC_AMPERE
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #else
         GGML_UNUSED(mma_A);
         GGML_UNUSED(mma_B);
@@ -193,7 +193,7 @@ struct mma_int_C_I16J8 {
 
     __device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) {
 #ifdef INT8_MMA_AVAILABLE
-#if __CUDA_ARCH__ >= CC_AMPERE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
         asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
             : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
             : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_A.x[2]), "r"(mma_A.x[3]), "r"(mma_B.x[0]), "r"(mma_B.x[1]));
@@ -211,7 +211,7 @@ struct mma_int_C_I16J8 {
         asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
             : "+r"(x[2]), "+r"(x[3])
             : "r"(mma_A.x[3]), "r"(mma_B.x[1]));
-#endif // __CUDA_ARCH__ >= CC_AMPERE
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #else
         GGML_UNUSED(mma_A);
         GGML_UNUSED(mma_B);
ggml/src/ggml-cuda/mmq.cu CHANGED
@@ -27,7 +27,7 @@ void ggml_cuda_op_mul_mat_q(
     // The stream-k decomposition is only faster for recent NVIDIA GPUs.
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
+    const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
     const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
 
     switch (src0->type) {
@@ -136,7 +136,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return true;
     }
 
-    if (cc < MIN_CC_DP4A) {
+    if (cc < GGML_CUDA_CC_DP4A) {
         return false;
     }
 
@@ -144,9 +144,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
     return true;
 #endif //GGML_CUDA_FORCE_MMQ
 
-    if (cc < CC_OFFSET_AMD) {
-        return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    if (cc < GGML_CUDA_CC_OFFSET_AMD) {
+        return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }
 
-    return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    return (cc < GGML_CUDA_CC_RDNA3 && cc != GGML_CUDA_CC_CDNA && cc != GGML_CUDA_CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
ggml/src/ggml-cuda/mmq.cuh CHANGED
@@ -89,9 +89,9 @@ struct tile_x_sizes {
 static constexpr int get_mmq_x_max_host(const int cc) {
     return int8_mma_available(cc) ? 128 :
 #ifdef GGML_CUDA_FORCE_MMQ
-        cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
+        cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64;
 #else
-        cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
+        cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
 #endif // GGML_CUDA_FORCE_MMQ
 }
 
@@ -104,23 +104,23 @@ static constexpr __device__ int get_mmq_x_max_device() {
     return 128;
 #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
-#if __CUDA_ARCH__ >= CC_VOLTA
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #ifdef GGML_CUDA_FORCE_MMQ
     return MMQ_DP4A_MAX_BATCH_SIZE;
 #else // GGML_CUDA_FORCE_MMQ
     return 128;
 #endif // GGML_CUDA_FORCE_MMQ
-#else // __CUDA_ARCH__ >= CC_VOLTA
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 
     return 64;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #endif // INT8_MMA_AVAILABLE
 }
 
 static constexpr int get_mmq_y_host(const int cc) {
-    return cc >= CC_OFFSET_AMD ? (cc == CC_RDNA1 ? 64 : 128) : (cc >= CC_VOLTA ? 128 : 64);
+    return cc >= GGML_CUDA_CC_OFFSET_AMD ? (cc == GGML_CUDA_CC_RDNA1 ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
 }
 
 static constexpr __device__ int get_mmq_y_device() {
@@ -131,11 +131,11 @@ static constexpr __device__ int get_mmq_y_device() {
     return 128;
 #endif // defined RDNA1
 #else
-#if __CUDA_ARCH__ >= CC_VOLTA
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
     return 128;
 #else
     return 64;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }
 
@@ -2574,11 +2574,11 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
     __launch_bounds__(WARP_SIZE*nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
 #else
-#if __CUDA_ARCH__ >= CC_VOLTA
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
     __launch_bounds__(WARP_SIZE*nwarps, 1)
 #else
     __launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // __CUDA_ARCH__ >= CC_VOLTA
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 static __global__ void mul_mat_q(
     const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
@@ -2594,7 +2594,7 @@ static __global__ void mul_mat_q(
     constexpr int mmq_y = get_mmq_y_device();
 
     // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
     {
         constexpr bool fixup = false;
         mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
@@ -2602,7 +2602,7 @@ static __global__ void mul_mat_q(
             blockIdx.x, blockIdx.y, 0, ne00/qk);
         return;
     }
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
 
     const int64_t blocks_per_ne00 = ne00 / qk;
     constexpr int blocks_per_iter = MMQ_ITER_K / qk;
@@ -2825,7 +2825,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
     const int mmq_x_max = get_mmq_x_max_host(cc);
     const int mmq_y = get_mmq_y_host(cc);
     const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
-    const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
+    const bool use_stream_k = cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;
 
     int mmq_x_best = 0;
     int nparts_best = INT_MAX;
ggml/src/ggml-cuda/mmvq.cu CHANGED
@@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
     int64_t nwarps = 1;
     int64_t rows_per_cuda_block = 1;
 
-    if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
+    if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_CDNA || ggml_cuda_info().devices[id].cc == GGML_CUDA_CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
         switch(ncols_y) {
             case 1:
                 nwarps = 4;
ggml/src/ggml-cuda/sum.cu CHANGED
@@ -3,8 +3,6 @@
 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
 
 #ifdef USE_CUB
-// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
-// For this reason CUB must be included BEFORE anything else.
 #include <cub/cub.cuh>
 using namespace cub;
 #endif // USE_CUB