Andreas Kieslinger committed · commit 8544072 · parent 2ac53b2
CUDA: rename macros to avoid conflicts with WinAPI (llama/10736)
* Renames NVIDIA GPU-architecture flags to avoid name clashes with WinAPI (e.g. CC_PASCAL: GPU architecture or WinAPI Pascal compiler flag?); a minimal sketch of the clash follows below.
* Reverts erroneous rename in SYCL-code.
* Renames GGML_CUDA_MIN_CC_DP4A to GGML_CUDA_CC_DP4A.
* Renames the rest of the compute capability macros for consistency.
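For illustration only, a minimal, self-contained sketch of the macro clash this rename avoids; the enum and its member names below are hypothetical and are not taken from WinAPI, CUB, or this commit:

    // Hypothetical example: a short, unprefixed macro defined before a library
    // header is included silently rewrites an identically named identifier there.
    #define CC_PASCAL 600                  // old-style compute-capability macro (pre-rename)

    // If a header included later contained e.g.
    //     enum cc_lang { CC_C, CC_PASCAL, CC_FORTRAN };
    // the preprocessor would expand it to
    //     enum cc_lang { CC_C, 600, CC_FORTRAN };   // -> compile error
    // The prefixed name introduced by this commit cannot collide that way:
    #define GGML_CUDA_CC_PASCAL 600

    int main() {
        return (GGML_CUDA_CC_PASCAL == CC_PASCAL) ? 0 : 1;   // both are 600 in this sketch
    }
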
- ggml/src/ggml-common.h +1 -1
- ggml/src/ggml-cuda/common.cuh +35 -35
- ggml/src/ggml-cuda/convert.cu +3 -3
- ggml/src/ggml-cuda/fattn.cu +1 -1
- ggml/src/ggml-cuda/ggml-cuda.cu +6 -6
- ggml/src/ggml-cuda/mma.cuh +4 -4
- ggml/src/ggml-cuda/mmq.cu +5 -5
- ggml/src/ggml-cuda/mmq.cuh +13 -13
- ggml/src/ggml-cuda/mmvq.cu +1 -1
- ggml/src/ggml-cuda/sum.cu +0 -2
ggml/src/ggml-common.h CHANGED

@@ -473,7 +473,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
     240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
 GGML_TABLE_END()

-//#if __CUDA_ARCH__ >=
+//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
 GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
     0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
ggml/src/ggml-cuda/common.cuh CHANGED

@@ -41,28 +41,28 @@
 #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 #define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

-#define
-#define
-#define
-#define
-#define
-#define
+#define GGML_CUDA_CC_PASCAL 600
+#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_VOLTA 700
+#define GGML_CUDA_CC_TURING 750
+#define GGML_CUDA_CC_AMPERE 800
+#define GGML_CUDA_CC_OFFSET_AMD 1000000

 // GCN/CNDA, wave size is 64
-#define
-#define
-#define
-#define
-#define
-#define
+#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
+#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
+#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
+#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing
+#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300

 // RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
-#define
-#define
-#define
+#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
+#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
+#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA

-#define
-#define
+#define GGML_CUDA_CC_QY1 210
+#define GGML_CUDA_CC_QY2 220

 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

@@ -131,36 +131,36 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_F16

-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >=
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >=
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >=
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #define FP16_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >=
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >=
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define INT8_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >=
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

-#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <=
+#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 #define FLASH_ATTN_AVAILABLE
-#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <=
+#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)

 static constexpr bool fast_fp16_available(const int cc) {
-    return cc >=
+    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
 }

 static constexpr bool fp16_mma_available(const int cc) {
-    return cc <
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
 }

 static constexpr bool int8_mma_available(const int cc) {
-    return cc <
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
 }

 [[noreturn]]
@@ -187,7 +187,7 @@ static __device__ void no_device_code(
 #endif // __CUDA_ARCH__

 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >=
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
@@ -195,7 +195,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
         x += __shfl_xor_sync(0xffffffff, x, offset, 32);
     }
     return x;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >=
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }

 static __device__ __forceinline__ float warp_reduce_sum(float x) {
@@ -284,7 +284,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 }

 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >=
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #pragma unroll
     for (int offset = 16; offset > 0; offset >>= 1) {
         x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
@@ -293,7 +293,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >=
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 }

 #if CUDART_VERSION < CUDART_HMASK
@@ -333,13 +333,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i

 #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

-#if __CUDA_ARCH__ >=
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
     return __dp4a(a, b, c);
-#else // __CUDA_ARCH__ >=
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
     const int8_t * a8 = (const int8_t *) &a;
     const int8_t * b8 = (const int8_t *) &b;
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A

 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }
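A small, self-contained usage sketch of how these constants are meant to be compared (the helper names below are hypothetical, not part of this diff): devices report a cc value of 100*major + 10*minor, with GGML_CUDA_CC_OFFSET_AMD added on AMD, as set up in ggml-cuda.cu further down, so a single integer comparison distinguishes both vendor and architecture generation.

    // Hypothetical helpers; the two defines are copied from the new common.cuh above.
    #define GGML_CUDA_CC_TURING     750
    #define GGML_CUDA_CC_OFFSET_AMD 1000000

    // NVIDIA: cc = 100*major + 10*minor             e.g. compute capability 8.6 -> 860
    // AMD:    cc = 100*major + 10*minor + offset    e.g. 11.0 (RDNA3)           -> 1001100
    static constexpr bool device_is_amd(const int cc) {
        return cc >= GGML_CUDA_CC_OFFSET_AMD;
    }
    static constexpr bool nvidia_turing_or_newer(const int cc) {
        return !device_is_amd(cc) && cc >= GGML_CUDA_CC_TURING;
    }

    int main() {
        return (nvidia_turing_or_newer(860) && !nvidia_turing_or_newer(1001100)) ? 0 : 1;
    }
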
ggml/src/ggml-cuda/convert.cu CHANGED

@@ -26,7 +26,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __

 template <bool need_check>
 static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
-#if __CUDA_ARCH__ >=
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
     constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;

     const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
@@ -64,7 +64,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
     GGML_UNUSED(y);
     GGML_UNUSED(k);
     NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 }

 template<typename dst_t>
@@ -599,7 +599,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
-            if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >=
+            if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_PASCAL) {
                 return dequantize_block_q8_0_f16_cuda;
             }
             return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
ggml/src/ggml-cuda/fattn.cu CHANGED

@@ -304,7 +304,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);

     // On AMD the tile kernels perform poorly, use the vec kernel instead:
-    if (cc >=
+    if (cc >= GGML_CUDA_CC_OFFSET_AMD) {
         if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
             ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
         } else {
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED

@@ -177,7 +177,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].smpb = prop.sharedMemPerBlock;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
-        info.devices[id].cc = 100*prop.major + 10*prop.minor +
+        info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
 #else
         info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
@@ -1081,7 +1081,7 @@ static void ggml_cuda_op_mul_mat_cublas(

     const int compute_capability = ggml_cuda_info().devices[id].cc;

-    if (compute_capability >=
+    if (compute_capability >= GGML_CUDA_CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
         if (src0->type != GGML_TYPE_F16) {
@@ -1108,7 +1108,7 @@ static void ggml_cuda_op_mul_mat_cublas(
         const half beta_f16 = 0.0f;

         cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
-        if (ggml_cuda_info().devices[ctx.device].cc ==
+        if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
             cu_compute_type = CUBLAS_COMPUTE_32F;
         }

@@ -1612,7 +1612,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
     cudaDataType_t cu_data_type = CUDA_R_16F;

-    if (ggml_cuda_info().devices[ctx.device].cc ==
+    if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
         cu_compute_type = CUBLAS_COMPUTE_32F;
     }

@@ -2357,7 +2357,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     std::vector<void *> ggml_cuda_cpy_fn_ptrs;

     if (cuda_ctx->cuda_graph->graph == nullptr) {
-        if (ggml_cuda_info().devices[cuda_ctx->device].cc <
+        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
@@ -3028,7 +3028,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 return true;
             }
             const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
-            return cc >=
+            return cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
         }
         case GGML_OP_CROSS_ENTROPY_LOSS:
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
ggml/src/ggml-cuda/mma.cuh CHANGED

@@ -171,7 +171,7 @@ struct mma_int_C_I16J8 {

     __device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) {
 #ifdef INT8_MMA_AVAILABLE
-#if __CUDA_ARCH__ >=
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
         asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
             : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
             : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0]));
@@ -183,7 +183,7 @@ struct mma_int_C_I16J8 {
         asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
             : "+r"(x[2]), "+r"(x[3])
             : "r"(mma_A.x[1]), "r"(mma_B.x[0]));
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #else
         GGML_UNUSED(mma_A);
         GGML_UNUSED(mma_B);
@@ -193,7 +193,7 @@ struct mma_int_C_I16J8 {

     __device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) {
 #ifdef INT8_MMA_AVAILABLE
-#if __CUDA_ARCH__ >=
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
         asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
             : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
             : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_A.x[2]), "r"(mma_A.x[3]), "r"(mma_B.x[0]), "r"(mma_B.x[1]));
@@ -211,7 +211,7 @@ struct mma_int_C_I16J8 {
         asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
             : "+r"(x[2]), "+r"(x[3])
             : "r"(mma_A.x[3]), "r"(mma_B.x[1]));
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #else
         GGML_UNUSED(mma_A);
         GGML_UNUSED(mma_B);
ggml/src/ggml-cuda/mmq.cu CHANGED

@@ -27,7 +27,7 @@ void ggml_cuda_op_mul_mat_q(
     // The stream-k decomposition is only faster for recent NVIDIA GPUs.
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = compute_capability >=
+    const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
     const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};

     switch (src0->type) {
@@ -136,7 +136,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return true;
     }

-    if (cc <
+    if (cc < GGML_CUDA_CC_DP4A) {
         return false;
     }

@@ -144,9 +144,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
     return true;
 #endif //GGML_CUDA_FORCE_MMQ

-    if (cc <
-        return cc <
+    if (cc < GGML_CUDA_CC_OFFSET_AMD) {
+        return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }

-    return (cc <
+    return (cc < GGML_CUDA_CC_RDNA3 && cc != GGML_CUDA_CC_CDNA && cc != GGML_CUDA_CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
ggml/src/ggml-cuda/mmq.cuh CHANGED

@@ -89,9 +89,9 @@ struct tile_x_sizes {
 static constexpr int get_mmq_x_max_host(const int cc) {
     return int8_mma_available(cc) ? 128 :
 #ifdef GGML_CUDA_FORCE_MMQ
-        cc >=
+        cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64;
 #else
-        cc >=
+        cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
 #endif // GGML_CUDA_FORCE_MMQ
 }

@@ -104,23 +104,23 @@ static constexpr __device__ int get_mmq_x_max_device() {
     return 128;
 #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

-#if __CUDA_ARCH__ >=
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #ifdef GGML_CUDA_FORCE_MMQ
     return MMQ_DP4A_MAX_BATCH_SIZE;
 #else // GGML_CUDA_FORCE_MMQ
     return 128;
 #endif // GGML_CUDA_FORCE_MMQ
-#else // __CUDA_ARCH__ >=
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

     return 64;
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #endif // INT8_MMA_AVAILABLE
 }

 static constexpr int get_mmq_y_host(const int cc) {
-    return cc >=
+    return cc >= GGML_CUDA_CC_OFFSET_AMD ? (cc == GGML_CUDA_CC_RDNA1 ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
 }

 static constexpr __device__ int get_mmq_y_device() {
@@ -131,11 +131,11 @@ static constexpr __device__ int get_mmq_y_device() {
     return 128;
 #endif // defined RDNA1
 #else
-#if __CUDA_ARCH__ >=
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
     return 128;
 #else
     return 64;
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }

@@ -2574,11 +2574,11 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
     __launch_bounds__(WARP_SIZE*nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
 #else
-#if __CUDA_ARCH__ >=
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
     __launch_bounds__(WARP_SIZE*nwarps, 1)
 #else
     __launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 static __global__ void mul_mat_q(
         const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
@@ -2594,7 +2594,7 @@ static __global__ void mul_mat_q(
     constexpr int mmq_y = get_mmq_y_device();

     // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ <
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
     {
         constexpr bool fixup = false;
         mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
@@ -2602,7 +2602,7 @@ static __global__ void mul_mat_q(
             blockIdx.x, blockIdx.y, 0, ne00/qk);
         return;
     }
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ <
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA

     const int64_t blocks_per_ne00 = ne00 / qk;
     constexpr int blocks_per_iter = MMQ_ITER_K / qk;
@@ -2825,7 +2825,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
     const int mmq_x_max = get_mmq_x_max_host(cc);
     const int mmq_y = get_mmq_y_host(cc);
     const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
-    const bool use_stream_k = cc >=
+    const bool use_stream_k = cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;

     int mmq_x_best = 0;
     int nparts_best = INT_MAX;
ggml/src/ggml-cuda/mmvq.cu CHANGED

@@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
     int64_t nwarps = 1;
     int64_t rows_per_cuda_block = 1;

-    if (ggml_cuda_info().devices[id].cc <
+    if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_CDNA || ggml_cuda_info().devices[id].cc == GGML_CUDA_CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
         switch(ncols_y) {
             case 1:
                 nwarps = 4;
ggml/src/ggml-cuda/sum.cu CHANGED

@@ -3,8 +3,6 @@
 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700

 #ifdef USE_CUB
-// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
-// For this reason CUB must be included BEFORE anything else.
 #include <cub/cub.cuh>
 using namespace cub;
 #endif // USE_CUB
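The two comment lines deleted here documented the include-order workaround for exactly the CC_PASCAL clash described in the commit message; with the macro now named GGML_CUDA_CC_PASCAL, CUB no longer has to be included before the other headers. A hedged sketch of the top of sum.cu after this commit (the #define USE_CUB guard is inferred from the #endif comment on line 3 and is not shown in the diff):

    // Sketch, not verbatim repository code.
    #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
    #define USE_CUB   // inferred: only the closing #endif of this guard appears in the diff
    #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700

    #ifdef USE_CUB
    #include <cub/cub.cuh>   // ordering relative to the ggml headers is no longer load-bearing
    using namespace cub;
    #endif // USE_CUB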