Slobodan Josic committed on
Commit
a73f01f
·
1 Parent(s): 71d72f9

HIP: Add support for RDNA4 targets (llama/12372)

Browse files
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -52,7 +52,7 @@
52
  #define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
53
 
54
  // AMD
55
- // GCN/CNDA, wave size is 64
56
  #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
57
  #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
58
  #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
@@ -60,16 +60,18 @@
60
  #define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
61
  #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
62
 
63
- // RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
64
  #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
65
  #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
66
  #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
 
67
 
68
  #define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
69
  #define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
70
  #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
71
  #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
72
- #define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
 
73
  #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
74
  #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
75
 
@@ -209,9 +211,9 @@ typedef float2 dfloat2;
209
  #define FP16_MMA_AVAILABLE
210
  #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
211
 
212
- #if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3))
213
  #define FP16_MMA_AVAILABLE
214
- #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3))
215
 
216
  #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
217
  #define NEW_MMA_AVAILABLE
@@ -244,14 +246,14 @@ static bool fp16_mma_available(const int cc) {
244
  return false;
245
  #else
246
  return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
247
- GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
248
  #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
249
  }
250
 
251
  // To be used for feature selection of external libraries, e.g. cuBLAS.
252
  static bool fp16_mma_hardware_available(const int cc) {
253
  return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
254
- GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
255
  }
256
 
257
  // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
@@ -409,7 +411,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
409
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
410
  #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
411
  c = __builtin_amdgcn_sdot4(a, b, c, false);
412
- #elif defined(RDNA3)
413
  c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
414
  #elif defined(RDNA1) || defined(__gfx900__)
415
  int tmp1;
 
52
  #define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
53
 
54
  // AMD
55
+ // GCN/CDNA, wave size is 64
56
  #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
57
  #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
58
  #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
 
60
  #define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
61
  #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
62
 
63
+ // RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
64
  #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
65
  #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
66
  #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
67
+ #define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
68
 
69
  #define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
70
  #define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
71
  #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
72
  #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
73
+ #define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
74
+ #define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
75
  #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
76
  #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
77
 
 
211
  #define FP16_MMA_AVAILABLE
212
  #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
213
 
214
+ #if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
215
  #define FP16_MMA_AVAILABLE
216
+ #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
217
 
218
  #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
219
  #define NEW_MMA_AVAILABLE
 
246
  return false;
247
  #else
248
  return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
249
+ GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
250
  #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
251
  }
252
 
253
  // To be used for feature selection of external libraries, e.g. cuBLAS.
254
  static bool fp16_mma_hardware_available(const int cc) {
255
  return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
256
+ GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
257
  }
258
 
259
  // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 
411
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
412
  #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
413
  c = __builtin_amdgcn_sdot4(a, b, c, false);
414
+ #elif defined(RDNA3) || defined(RDNA4)
415
  c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
416
  #elif defined(RDNA1) || defined(__gfx900__)
417
  int tmp1;
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -1216,7 +1216,7 @@ static void ggml_cuda_op_mul_mat_cublas(
1216
 
1217
  CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1218
 
1219
- if (GGML_CUDA_CC_IS_CDNA(cc)) {
1220
  const float alpha = 1.0f;
1221
  const float beta = 0.0f;
1222
  CUBLAS_CHECK(
@@ -1759,7 +1759,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
1759
  beta = &beta_f32;
1760
  }
1761
 
1762
- if (GGML_CUDA_CC_IS_CDNA(ggml_cuda_info().devices[ctx.device].cc)) {
 
 
1763
  cu_compute_type = CUBLAS_COMPUTE_32F;
1764
  alpha = &alpha_f32;
1765
  beta = &beta_f32;
@@ -1836,7 +1838,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
1836
  }
1837
  #endif
1838
 
1839
- if (dst->op_params[0] == GGML_PREC_DEFAULT) {
1840
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
1841
  to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
1842
  }
 
1216
 
1217
  CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1218
 
1219
+ if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
1220
  const float alpha = 1.0f;
1221
  const float beta = 0.0f;
1222
  CUBLAS_CHECK(
 
1759
  beta = &beta_f32;
1760
  }
1761
 
1762
+ int id = ggml_cuda_get_device();
1763
+ const int cc = ggml_cuda_info().devices[id].cc;
1764
+ if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
1765
  cu_compute_type = CUBLAS_COMPUTE_32F;
1766
  alpha = &alpha_f32;
1767
  beta = &beta_f32;
 
1838
  }
1839
  #endif
1840
 
1841
+ if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type == CUDA_R_16F) {
1842
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
1843
  to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
1844
  }
ggml/src/ggml-cuda/mmq.cu CHANGED
@@ -149,5 +149,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
149
  return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
150
  }
151
 
152
- return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
153
  }
 
149
  return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
150
  }
151
 
152
+ return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
153
  }
ggml/src/ggml-cuda/mmq.cuh CHANGED
@@ -2577,9 +2577,9 @@ static __device__ void mul_mat_q_process_tile(
2577
 
2578
  template <ggml_type type, int mmq_x, int nwarps, bool need_check>
2579
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
2580
- #if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
2581
  __launch_bounds__(WARP_SIZE*nwarps, 2)
2582
- #endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
2583
  #else
2584
  #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
2585
  __launch_bounds__(WARP_SIZE*nwarps, 1)
 
2577
 
2578
  template <ggml_type type, int mmq_x, int nwarps, bool need_check>
2579
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
2580
+ #if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
2581
  __launch_bounds__(WARP_SIZE*nwarps, 2)
2582
+ #endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
2583
  #else
2584
  #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
2585
  __launch_bounds__(WARP_SIZE*nwarps, 1)
ggml/src/ggml-cuda/mmvq.cu CHANGED
@@ -54,7 +54,7 @@ enum mmvq_parameter_table_id {
54
  };
55
 
56
  static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
57
- #if defined(RDNA2) || defined(RDNA3)
58
  return MMVQ_PARAMETERS_RDNA2;
59
  #elif defined(GCN) || defined(CDNA)
60
  return MMVQ_PARAMETERS_GCN;
@@ -64,7 +64,7 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
64
  }
65
 
66
  static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
67
- if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
68
  return MMVQ_PARAMETERS_RDNA2;
69
  }
70
  if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
 
54
  };
55
 
56
  static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
57
+ #if defined(RDNA2) || defined(RDNA3) || defined(RDNA4)
58
  return MMVQ_PARAMETERS_RDNA2;
59
  #elif defined(GCN) || defined(CDNA)
60
  return MMVQ_PARAMETERS_GCN;
 
64
  }
65
 
66
  static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
67
+ if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
68
  return MMVQ_PARAMETERS_RDNA2;
69
  }
70
  if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
ggml/src/ggml-cuda/vendors/hip.h CHANGED
@@ -151,6 +151,10 @@
151
  #define CDNA
152
  #endif
153
 
 
 
 
 
154
  #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
155
  defined(__gfx1150__) || defined(__gfx1151__)
156
  #define RDNA3
 
151
  #define CDNA
152
  #endif
153
 
154
+ #if defined(__GFX12__)
155
+ #define RDNA4
156
+ #endif
157
+
158
  #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
159
  defined(__gfx1150__) || defined(__gfx1151__)
160
  #define RDNA3