uvos committed
Commit e37eff3 · 1 Parent(s): f9dbd96

HIP: remove the use of __HIP_PLATFORM_AMD__, explicitly support only AMD targets (llama/14945)

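In short: code paths that were previously gated on defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) are now gated on GGML_USE_HIP alone, and ggml/src/ggml-cuda/vendors/hip.h rejects non-AMD HIP platforms at compile time with an #error. Below is a minimal standalone sketch of the resulting guard pattern; this is hypothetical illustration code, not taken from the repository, and the ggml_example_backend helper is illustrative only.

#include <cstdio>

#if defined(GGML_USE_HIP)
// After this commit GGML_USE_HIP alone implies an AMD target; the real header
// enforces this with: #if !defined(__HIP_PLATFORM_AMD__) -> #error.
static const char * ggml_example_backend() { return "HIP (AMD)"; }
#else
static const char * ggml_example_backend() { return "CUDA"; }
#endif

int main() {
    // Compile with -DGGML_USE_HIP to take the AMD/HIP branch; without it, the CUDA branch is used.
    std::printf("selected backend path: %s\n", ggml_example_backend());
    return 0;
}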
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -176,7 +176,7 @@ static const char * cu_get_error_str(CUresult err) {
 #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
 #endif

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 # define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
     do { \
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
@@ -191,7 +191,7 @@ static const char * cu_get_error_str(CUresult err) {
     do { \
         GGML_UNUSED(nbytes); \
     } while (0)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#endif // !(defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

 #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
@@ -211,9 +211,9 @@ typedef float2 dfloat2;
 #define GGML_USE_VMM
 #endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))

-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
@@ -227,17 +227,17 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))

-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
 #define AMD_MFMA_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #define CP_ASYNC_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

 #if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 #define FLASH_ATTN_AVAILABLE
@@ -259,7 +259,7 @@ static bool fast_fp16_hardware_available(const int cc) {

 // Any FP16 tensor core instructions are available for ggml code.
 static bool fp16_mma_available(const int cc) {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
+#if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
     return false;
 #else
     if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
@@ -275,7 +275,7 @@ static bool fp16_mma_available(const int cc) {
     } else {
         return false;
     }
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
+#endif // defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }

 // To be used for feature selection of external libraries, e.g. cuBLAS.
@@ -312,25 +312,25 @@ static bool cp_async_available(const int cc) {
 }

 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
     return 64;
 #else
     return 32;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
 }

 [[noreturn]]
 static __device__ void no_device_code(
     const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
     printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
            file_name, line, function_name, arch);
     GGML_UNUSED(arch_list);
 #else
     printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
            file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
     __trap();

     GGML_UNUSED(no_device_code); // suppress unused function warning
@@ -367,7 +367,7 @@ struct ggml_cuda_unroll<1> {

 template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
@@ -375,7 +375,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
         x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }

 template<int width = WARP_SIZE>
@@ -444,11 +444,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #ifdef FP16_AVAILABLE

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+#if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
     return __float2half(fmaxf(__half2float(a), __half2float(b)));
 #else
     return __hmax(a, b);
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+#endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX

 #else
     NO_DEVICE_CODE;
@@ -476,7 +476,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal

 template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
         x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
@@ -485,7 +485,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }

 #if CUDART_VERSION < CUDART_HMASK
@@ -497,7 +497,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 #endif // CUDART_VERSION < CUDART_HMASK

 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3) || defined(RDNA4)
@@ -523,7 +523,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #endif
     return c;

-#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#else // defined(GGML_USE_HIP)

 #if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
     return __dp4a(a, b, c);
@@ -533,7 +533,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)

-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 }

 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
ggml/src/ggml-cuda/fattn-common.cuh CHANGED
@@ -592,9 +592,9 @@ static __global__ void flash_attn_stream_k_fixup(
 }

 template<int D> // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP)
 static __global__ void flash_attn_combine_results(
         const float  * __restrict__ VKQ_parts,
         const float2 * __restrict__ VKQ_meta,
ggml/src/ggml-cuda/fattn-mma-f16.cuh CHANGED
@@ -1391,24 +1391,24 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
         constexpr bool use_logit_softcap = false;
         fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
         if (!shared_memory_limit_raised[id]) {
             CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
             shared_memory_limit_raised[id] = true;
         }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
     } else {
         constexpr bool use_logit_softcap = true;
         fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
         if (!shared_memory_limit_raised[id]) {
             CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
             shared_memory_limit_raised[id] = true;
         }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
     }

     launch_fattn<DV, ncols1, ncols2>
ggml/src/ggml-cuda/fattn-tile-f16.cu CHANGED
@@ -5,9 +5,9 @@
 #define FATTN_KQ_STRIDE_TILE_F16 64

 template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
 __launch_bounds__(nwarps*WARP_SIZE, 2)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)
 static __global__ void flash_attn_tile_ext_f16(
         const char * __restrict__ Q,
         const char * __restrict__ K,
ggml/src/ggml-cuda/fattn-tile-f32.cu CHANGED
@@ -5,9 +5,9 @@
 #define FATTN_KQ_STRIDE_TILE_F32 32

 template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
 __launch_bounds__(nwarps*WARP_SIZE, 2)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)
 static __global__ void flash_attn_tile_ext_f32(
         const char * __restrict__ Q,
         const char * __restrict__ K,
ggml/src/ggml-cuda/fattn-wmma-f16.cu CHANGED
@@ -7,7 +7,7 @@
 #include "fattn-wmma-f16.cuh"

 #ifdef FP16_MMA_AVAILABLE
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
 #include <mma.h>
 #ifdef GGML_USE_MUSA
 namespace wmma = mtmusa::wmma;
@@ -18,7 +18,7 @@ namespace wmma = nvcuda::wmma;
 #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
 #include <rocwmma/rocwmma.hpp>
 namespace wmma = rocwmma;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)
 #endif // FP16_MMA_AVAILABLE

 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
@@ -546,7 +546,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
         return;
     }

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
     if (Q->ne[1] <= 8 && Q->ne[0] % warp_size == 0) {
         constexpr int cols_per_block = 8;
         switch (Q->ne[0]) {
@@ -568,7 +568,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
         }
         return;
     }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)

     if (Q->ne[1] <= 32) {
         constexpr int cols_per_block = 16;
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -128,7 +128,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     return err;
 }

-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 static int ggml_cuda_parse_id(char devName[]) {
     // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
     // these values are not stable so this is susceptible to breakage
@@ -175,10 +175,10 @@ static int ggml_cuda_parse_id(char devName[]) {
     archNum += archMinor;
     return archNum;
 }
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)

 static ggml_cuda_device_info ggml_cuda_init() {
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(GGML_USE_HIP)
     // Workaround for a rocBLAS bug when using multiple graphics cards:
     // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
     {
@@ -251,7 +251,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].nsm = prop.multiProcessorCount;
         info.devices[id].smpb = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
         info.devices[id].smpbo = prop.sharedMemPerBlock;

         info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
@@ -281,7 +281,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
         GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n",
                       id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
     }

     for (int id = 0; id < info.device_count; ++id) {
ggml/src/ggml-cuda/mma.cuh CHANGED
@@ -68,7 +68,7 @@ namespace ggml_cuda_mma {
         static constexpr int I = I_;
         static constexpr int J = J_;

-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
         static constexpr int ne = I * J / 64;
         T x[ne] = {0};

@@ -132,7 +132,7 @@ namespace ggml_cuda_mma {
                 static_assert(I == -1 && J == -1, "template specialization not implemented");
             }
         }
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
     };

     template <int I_, int J_>
ggml/src/ggml-cuda/mmq.cuh CHANGED
@@ -104,9 +104,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
     return 128;
 #else // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)

-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
     return 64;
-#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#else // defined(GGML_USE_HIP)

 #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #ifdef GGML_CUDA_FORCE_MMQ
@@ -118,7 +118,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
     return 64;
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 #endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
 }

@@ -128,7 +128,7 @@ static int get_mmq_y_host(const int cc) {
 }

 static constexpr __device__ int get_mmq_y_device() {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 #if defined(RDNA1)
     return 64;
 #else
@@ -140,7 +140,7 @@ static constexpr __device__ int get_mmq_y_device() {
 #else
     return 64;
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 }

 // Decouple shared memory tile sizes from WARP_SIZE to allow for different warp sizes.
@@ -250,7 +250,7 @@ static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/)
 }
 #endif // AMD_MFMA_AVAILABLE

-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 static int mmq_get_nwarps_host(const int cc) {
     return amd_mfma_available(cc) ? 8 : 4;
 }
@@ -258,10 +258,10 @@ static int mmq_get_nwarps_host(const int cc) {
 static int mmq_get_nwarps_host(const int /*cc*/) {
     return 8;
 }
-#endif // (GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // (GGML_USE_HIP)

 static constexpr __device__ int mmq_get_nwarps_device() {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 #if defined(AMD_MFMA_AVAILABLE)
     return 8;
 #else
@@ -269,7 +269,7 @@ static constexpr __device__ int mmq_get_nwarps_device() {
 #endif // AMD_MFMA_AVAILABLE
 #else
     return 8;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 }

 // ------------------------------------------------------------
@@ -3047,7 +3047,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
 // The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598

 template <ggml_type type, int mmq_x, bool need_check>
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 #if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
     __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
 #endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
@@ -3057,7 +3057,7 @@ template <ggml_type type, int mmq_x, bool need_check>
 #else
     __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 static __global__ void mul_mat_q(
         const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
         const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
@@ -3097,7 +3097,7 @@ static __global__ void mul_mat_q(
     __syncthreads();

     // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
+#if (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
     {
         const int wt = blockIdx.z / nchannels_y;
         const int zt = blockIdx.z - wt*nchannels_y;
@@ -3151,7 +3151,7 @@ static __global__ void mul_mat_q(
             tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
         return;
     }
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
+#endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA

     const int64_t blocks_per_ne00 = ncols_x / qk;
     constexpr int blocks_per_iter = MMQ_ITER_K / qk;
ggml/src/ggml-cuda/vendors/hip.h CHANGED
@@ -5,10 +5,8 @@
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
 #include <hip/hip_bfloat16.h>
-#ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__

 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -139,7 +137,7 @@
 #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED

-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 70000000
+#if HIP_VERSION >= 70000000
 #define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
@@ -151,7 +149,11 @@
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define cublasComputeType_t hipblasDatatype_t
 #define cudaDataType_t hipblasDatatype_t
-#endif
+#endif // HIP_VERSION >= 7000000
+
+#if !defined(__HIP_PLATFORM_AMD__)
+#error "The HIP backend supports only AMD targets"
+#endif // !defined(__HIP_PLATFORM_AMD__)

 #define __CUDA_ARCH__ 1300

@@ -249,7 +251,7 @@ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigne
     return c;
 }

-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
+#if HIP_VERSION < 50600000
 // __shfl_xor() for half2 was added in ROCm 5.6
 static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
     typedef union half2_b32 {
@@ -261,4 +263,4 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
     tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
     return tmp.val;
 }
-#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
+#endif // HIP_VERSION < 50600000