Slobodan Josic committed
Commit · a73f01f
1 Parent(s): 71d72f9

HIP: Add support for RDNA4 targets (llama/12372)
ggml/src/ggml-cuda/common.cuh
CHANGED
@@ -52,7 +52,7 @@
 #define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
 
 // AMD
-// GCN/
+// GCN/CDNA, wave size is 64
 #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
 #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
 #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
@@ -60,16 +60,18 @@
 #define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
 #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
 
-//
+// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
 #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
 #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
 #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
 
 #define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
 #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
 #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
-#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
 #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
 #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
 
@@ -209,9 +211,9 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 
-#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3))
+#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
 #define FP16_MMA_AVAILABLE
-#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3))
+#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
 
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
@@ -244,14 +246,14 @@ static bool fp16_mma_available(const int cc) {
     return false;
 #else
     return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
-        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fp16_mma_hardware_available(const int cc) {
     return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
-        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
 }
 
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
@@ -409,7 +411,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
+#elif defined(RDNA3) || defined(RDNA4)
     c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
 #elif defined(RDNA1) || defined(__gfx900__)
     int tmp1;
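As a side note on the common.cuh changes above: the RDNA4 entry extends the AMD compute-capability ladder (architecture id plus GGML_CUDA_CC_OFFSET_AMD), and GGML_CUDA_CC_IS_RDNA3 gains an upper bound so that gfx12xx devices no longer also satisfy the RDNA3 check. A minimal standalone sketch of that classification follows; the constant names and the offset value are stand-ins chosen so the snippet compiles on its own, not ggml's actual definitions.

// Standalone sketch of the capability classification after this commit.
// CC_OFFSET_AMD is an assumed placeholder value; ggml defines the real offset in common.cuh.
#include <cassert>

constexpr int CC_OFFSET_AMD = 0x1000000;               // placeholder
constexpr int CC_RDNA3      = CC_OFFSET_AMD + 0x1100;  // RX 7000
constexpr int CC_RDNA4      = CC_OFFSET_AMD + 0x1200;  // RX 9000

constexpr bool is_rdna3(int cc) { return cc >= CC_RDNA3 && cc < CC_RDNA4; } // now bounded above
constexpr bool is_rdna4(int cc) { return cc >= CC_RDNA4; }

int main() {
    const int cc_gfx1100 = CC_OFFSET_AMD + 0x1100; // RX 7900 class
    const int cc_gfx1201 = CC_OFFSET_AMD + 0x1201; // RX 9000 class
    assert( is_rdna3(cc_gfx1100) && !is_rdna4(cc_gfx1100));
    assert(!is_rdna3(cc_gfx1201) &&  is_rdna4(cc_gfx1201)); // previously gfx12xx also matched the RDNA3 check
    return 0;
}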
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
@@ -1216,7 +1216,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
     CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
 
-    if (GGML_CUDA_CC_IS_CDNA(cc)) {
+    if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
         const float alpha = 1.0f;
         const float beta = 0.0f;
         CUBLAS_CHECK(
@@ -1759,7 +1759,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         beta = &beta_f32;
     }
 
-
+    int id = ggml_cuda_get_device();
+    const int cc = ggml_cuda_info().devices[id].cc;
+    if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
         cu_compute_type = CUBLAS_COMPUTE_32F;
         alpha = &alpha_f32;
         beta = &beta_f32;
@@ -1836,7 +1838,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     }
 #endif
 
-    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+    if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type == CUDA_R_16F) {
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
         to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
     }
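For the ggml-cuda.cu hunks above: the batched GEMM path now looks up the active device's compute capability and, on CDNA and RDNA4, switches to FP32 accumulation with FP32 alpha/beta, while the trailing FP16-to-FP32 conversion is additionally gated on the GEMM output actually being FP16. A self-contained sketch of that decision, using stand-in constants and structs rather than ggml's or cuBLAS's real types:

// Illustrative only: stand-in constants and a toy plan struct, not ggml's implementation.
#include <cstdio>

constexpr int CC_OFFSET_AMD = 0x1000000;              // placeholder, as in the sketch above
constexpr int CC_CDNA       = CC_OFFSET_AMD + 0x908;  // placeholder for the CDNA threshold
constexpr int CC_RDNA1      = CC_OFFSET_AMD + 0x1010;
constexpr int CC_RDNA4      = CC_OFFSET_AMD + 0x1200;

constexpr bool is_cdna (int cc) { return cc >= CC_CDNA && cc < CC_RDNA1; }
constexpr bool is_rdna4(int cc) { return cc >= CC_RDNA4; }

struct batched_gemm_plan {
    bool fp32_compute;        // CUBLAS_COMPUTE_32F with float alpha/beta
    bool convert_f16_output;  // run the fp16 -> fp32 pass afterwards
};

static batched_gemm_plan plan_batched_gemm(int cc, bool prec_default, bool out_is_f16) {
    batched_gemm_plan p{};
    p.fp32_compute       = is_cdna(cc) || is_rdna4(cc);
    p.convert_f16_output = prec_default && out_is_f16; // mirrors the new cu_data_type == CUDA_R_16F guard
    return p;
}

int main() {
    const batched_gemm_plan p = plan_batched_gemm(CC_RDNA4 + 1, /*prec_default=*/true, /*out_is_f16=*/true);
    std::printf("fp32_compute=%d convert_f16_output=%d\n", p.fp32_compute, p.convert_f16_output);
    return 0;
}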
ggml/src/ggml-cuda/mmq.cu
CHANGED
@@ -149,5 +149,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }
 
-    return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
ggml/src/ggml-cuda/mmq.cuh
CHANGED
@@ -2577,9 +2577,9 @@ static __device__ void mul_mat_q_process_tile(
 
 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
+#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
     __launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
+#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
 #else
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
     __launch_bounds__(WARP_SIZE*nwarps, 1)
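For readers unfamiliar with the annotation being extended in mmq.cuh: __launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor) is a standard CUDA/HIP kernel attribute that caps the block size and asks the compiler to keep register usage low enough for the requested occupancy, and the hunk above simply lets RDNA4 builds take the same (WARP_SIZE*nwarps, 2) bound the other AMD families already use. A toy kernel showing the syntax, unrelated to ggml's actual kernels; WARP_SIZE is defined locally here purely for illustration:

// Toy HIP/CUDA kernel demonstrating the __launch_bounds__ form used above.
#define WARP_SIZE 32

template <int nwarps>
__global__ void __launch_bounds__(WARP_SIZE*nwarps, 2) scale_kernel(float * dst, float s) {
    // the attribute promises at most WARP_SIZE*nwarps threads per block and
    // requests at least 2 resident blocks per multiprocessor/CU
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    dst[i] *= s;
}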
ggml/src/ggml-cuda/mmvq.cu
CHANGED
@@ -54,7 +54,7 @@ enum mmvq_parameter_table_id {
 };
 
 static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
-#if defined(RDNA2) || defined(RDNA3)
+#if defined(RDNA2) || defined(RDNA3) || defined(RDNA4)
     return MMVQ_PARAMETERS_RDNA2;
 #elif defined(GCN) || defined(CDNA)
     return MMVQ_PARAMETERS_GCN;
@@ -64,7 +64,7 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
 }
 
 static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
-    if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
+    if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
         return MMVQ_PARAMETERS_RDNA2;
     }
     if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
ggml/src/ggml-cuda/vendors/hip.h
CHANGED
@@ -151,6 +151,10 @@
 #define CDNA
 #endif
 
+#if defined(__GFX12__)
+#define RDNA4
+#endif
+
 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
     defined(__gfx1150__) || defined(__gfx1151__)
 #define RDNA3
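Finally, vendors/hip.h is where the gfx target macros provided by the device compiler are folded into the family macros (GCN, CDNA, RDNA3, and now RDNA4) used throughout the backend; the new block keys RDNA4 off __GFX12__, as the hunk above shows. A reduced sketch of that mapping, with a host-side main so it also builds as plain C++; the family-string macro is purely illustrative, and the __GFX12__ / __gfx11xx__ checks are taken directly from the diff:

#include <cstdio>

#if defined(__GFX12__)
#define EXAMPLE_FAMILY "RDNA4"
#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
#define EXAMPLE_FAMILY "RDNA3"
#else
#define EXAMPLE_FAMILY "other"
#endif

int main() { std::printf("compiled for family: %s\n", EXAMPLE_FAMILY); return 0; }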
|