HIP: remove the use of __HIP_PLATFORM_AMD__, explicitly support only AMD targets (llama/14945)
- ggml/src/ggml-cuda/common.cuh          +25 -25
- ggml/src/ggml-cuda/fattn-common.cuh      +2  -2
- ggml/src/ggml-cuda/fattn-mma-f16.cuh     +4  -4
- ggml/src/ggml-cuda/fattn-tile-f16.cu     +2  -2
- ggml/src/ggml-cuda/fattn-tile-f32.cu     +2  -2
- ggml/src/ggml-cuda/fattn-wmma-f16.cu     +4  -4
- ggml/src/ggml-cuda/ggml-cuda.cu          +5  -5
- ggml/src/ggml-cuda/mma.cuh               +2  -2
- ggml/src/ggml-cuda/mmq.cuh              +13 -13
- ggml/src/ggml-cuda/vendors/hip.h         +8  -6
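The hunks below all apply the same edit: preprocessor conditions that used to test for __HIP_PLATFORM_AMD__ alongside GGML_USE_HIP now test GGML_USE_HIP alone, and vendors/hip.h instead rejects non-AMD HIP platforms outright. The removed (-) lines are shown as they survive on this page, cut short where the old condition differed, so the "before" shape in this sketch is inferred from the commit title rather than copied from them:

// Sketch of the guard pattern; only GGML_USE_HIP and __HIP_PLATFORM_AMD__ are
// taken from the diff below, the "before" form is an assumption.

// Before (assumed shape): AMD HIP code paths were keyed on both macros.
// #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
// ... AMD-specific code ...
// #endif

// After: GGML_USE_HIP alone selects the AMD code path ...
#if defined(GGML_USE_HIP)
// ... AMD-specific code ...
#endif

// ... and the vendor header fails the build on non-AMD HIP platforms
// (see the last hunk of vendors/hip.h):
// #if !defined(__HIP_PLATFORM_AMD__)
// #error "The HIP backend supports only AMD targets"
// #endif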
ggml/src/ggml-cuda/common.cuh
CHANGED
@@ -176,7 +176,7 @@ static const char * cu_get_error_str(CUresult err) {
 #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
 #endif
 
-#if !
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 # define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
     do { \
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
@@ -191,7 +191,7 @@ static const char * cu_get_error_str(CUresult err) {
     do { \
         GGML_UNUSED(nbytes); \
     } while (0)
-#endif // !(defined(GGML_USE_HIP) &&
+#endif // !(defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 
 #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
@@ -211,9 +211,9 @@ typedef float2 dfloat2;
 #define GGML_USE_VMM
 #endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
 
-#if
+#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
-#endif //
+#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 
 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
@@ -227,17 +227,17 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
 
-#if defined(GGML_USE_HIP) && defined(
+#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
 #define AMD_MFMA_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(
+#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
 
-#if !
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
-#endif // !
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 
-#if !
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #define CP_ASYNC_AVAILABLE
-#endif // !
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 
 #if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 #define FLASH_ATTN_AVAILABLE
@@ -259,7 +259,7 @@ static bool fast_fp16_hardware_available(const int cc) {
 
 // Any FP16 tensor core instructions are available for ggml code.
 static bool fp16_mma_available(const int cc) {
-#if defined(GGML_USE_HIP) &&
+#if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
     return false;
 #else
     if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
@@ -275,7 +275,7 @@ static bool fp16_mma_available(const int cc) {
     } else {
         return false;
     }
-#endif // defined(GGML_USE_HIP) &&
+#endif // defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
@@ -312,25 +312,25 @@ static bool cp_async_available(const int cc) {
 }
 
 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
-#if defined(GGML_USE_HIP) &&
+#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
     return 64;
 #else
     return 32;
-#endif // defined(GGML_USE_HIP) &&
+#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
 }
 
 [[noreturn]]
 static __device__ void no_device_code(
     const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
 
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP)
     printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
            file_name, line, function_name, arch);
     GGML_UNUSED(arch_list);
 #else
     printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
            file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP)
     __trap();
 
     GGML_UNUSED(no_device_code); // suppress unused function warning
@@ -367,7 +367,7 @@ struct ggml_cuda_unroll<1> {
 
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
@@ -375,7 +375,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
         x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
-#endif // !
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }
 
 template<int width = WARP_SIZE>
@@ -444,11 +444,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #ifdef FP16_AVAILABLE
 
-#if !
+#if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
     return __float2half(fmaxf(__half2float(a), __half2float(b)));
 #else
     return __hmax(a, b);
-#endif // !
+#endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
 
 #else
     NO_DEVICE_CODE;
@@ -476,7 +476,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
         x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
@@ -485,7 +485,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }
 
 #if CUDART_VERSION < CUDART_HMASK
@@ -497,7 +497,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 #endif // CUDART_VERSION < CUDART_HMASK
 
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP)
 #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3) || defined(RDNA4)
@@ -523,7 +523,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #endif
     return c;
 
-#else // defined(GGML_USE_HIP)
+#else // defined(GGML_USE_HIP)
 
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
     return __dp4a(a, b, c);
@@ -533,7 +533,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
 
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP)
 }
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
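As a side note on ggml_cuda_get_physical_warp_size() above: the 64-wide case is now keyed on HIP plus GFX8/GFX9. A standalone CUDA sketch (not repository code; physical_warp_size, warp_reduce_sum_f32 and sum_one_warp are made-up names) of how a warp-level reduction can be templated on that compile-time width:

#include <cuda_runtime.h>
#include <cstdio>

// Mirrors the shape of ggml_cuda_get_physical_warp_size() above (sketch only).
static __host__ __device__ constexpr int physical_warp_size() {
#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
    return 64; // wave64 on GFX8/GFX9 AMD GPUs
#else
    return 32; // NVIDIA warps and RDNA wave32
#endif
}

// Butterfly reduction across one warp; the width is fixed at compile time.
template <int width>
static __device__ float warp_reduce_sum_f32(float x) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
}

__global__ void sum_one_warp(const float * in, float * out) {
    constexpr int ws = physical_warp_size();
    const float v = warp_reduce_sum_f32<ws>(in[threadIdx.x]);
    if (threadIdx.x == 0) {
        *out = v; // every lane holds the warp sum after the butterfly
    }
}

int main() {
    constexpr int ws = physical_warp_size();
    float h_in[ws], h_out = 0.0f;
    for (int i = 0; i < ws; ++i) { h_in[i] = 1.0f; }
    float * d_in = nullptr, * d_out = nullptr;
    cudaMalloc(&d_in, ws*sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_in, h_in, ws*sizeof(float), cudaMemcpyHostToDevice);
    sum_one_warp<<<1, ws>>>(d_in, d_out); // exactly one warp per block
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("sum = %.1f (expected %d.0)\n", h_out, ws);
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}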
ggml/src/ggml-cuda/fattn-common.cuh
CHANGED
@@ -592,9 +592,9 @@ static __global__ void flash_attn_stream_k_fixup(
 }
 
 template<int D> // D == head size
-#if !
+#if !defined(GGML_USE_HIP)
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIP)
+#endif // !(defined(GGML_USE_HIP)
 static __global__ void flash_attn_combine_results(
         const float * __restrict__ VKQ_parts,
         const float2 * __restrict__ VKQ_meta,
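The hunk above keeps __launch_bounds__(D, 1) only for non-HIP builds. For reference, the qualifier tells the compiler the maximum block size (and a minimum number of resident blocks per multiprocessor) to assume when budgeting registers; a generic sketch, not repository code, meant to sit in any .cu file:

// Generic sketch: promise the compiler at most 256 threads per block and ask
// for at least 2 resident blocks per SM, so registers are budgeted accordingly.
__global__ void __launch_bounds__(256, 2) scale_kernel(float * data, float factor, int n) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        data[i] *= factor;
    }
}

// Launching with more than 256 threads per block would now be a launch error:
// scale_kernel<<<num_blocks, 256>>>(d_data, 2.0f, n);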
ggml/src/ggml-cuda/fattn-mma-f16.cuh
CHANGED
@@ -1391,24 +1391,24 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
         constexpr bool use_logit_softcap = false;
         fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;
 
-#if !
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
         if (!shared_memory_limit_raised[id]) {
             CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
             shared_memory_limit_raised[id] = true;
         }
-#endif // !
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
     } else {
         constexpr bool use_logit_softcap = true;
         fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;
 
-#if !
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
        if (!shared_memory_limit_raised[id]) {
             CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
             shared_memory_limit_raised[id] = true;
         }
-#endif // !
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
     }
 
     launch_fattn<DV, ncols1, ncols2>
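The block guarded above opts the selected kernel into more dynamic shared memory than the default 48 KiB per-kernel limit on CUDA; under HIP or MUSA the attribute call is skipped. A self-contained sketch of the same opt-in with a hypothetical kernel (big_smem_kernel) and size; it assumes a GPU that supports large dynamic shared memory (Volta or newer):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void big_smem_kernel(char * out, int n) {
    extern __shared__ char smem[];              // dynamically sized shared memory
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        smem[i] = (char)(i & 0x7f);
    }
    __syncthreads();
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        out[i] = smem[i];
    }
}

int main() {
    const int nbytes = 64*1024;                 // above the default 48 KiB opt-in limit
    // Raise the per-kernel limit once before launching with that much dynamic
    // shared memory (the guarded block above does the same for fattn_kernel).
    cudaError_t err = cudaFuncSetAttribute(big_smem_kernel,
        cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "opt-in failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    char * d_out = nullptr;
    cudaMalloc(&d_out, nbytes);
    big_smem_kernel<<<1, 256, nbytes>>>(d_out, nbytes);
    err = cudaDeviceSynchronize();
    printf("kernel %s\n", err == cudaSuccess ? "ok" : cudaGetErrorString(err));
    cudaFree(d_out);
    return err == cudaSuccess ? 0 : 1;
}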
ggml/src/ggml-cuda/fattn-tile-f16.cu
CHANGED
@@ -5,9 +5,9 @@
 #define FATTN_KQ_STRIDE_TILE_F16 64
 
 template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
-#if !
+#if !defined(GGML_USE_HIP)
 __launch_bounds__(nwarps*WARP_SIZE, 2)
-#endif // !
+#endif // !defined(GGML_USE_HIP)
 static __global__ void flash_attn_tile_ext_f16(
         const char * __restrict__ Q,
         const char * __restrict__ K,
ggml/src/ggml-cuda/fattn-tile-f32.cu
CHANGED
@@ -5,9 +5,9 @@
 #define FATTN_KQ_STRIDE_TILE_F32 32
 
 template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
-#if !
+#if !defined(GGML_USE_HIP)
 __launch_bounds__(nwarps*WARP_SIZE, 2)
-#endif // !
+#endif // !defined(GGML_USE_HIP)
 static __global__ void flash_attn_tile_ext_f32(
         const char * __restrict__ Q,
         const char * __restrict__ K,
ggml/src/ggml-cuda/fattn-wmma-f16.cu
CHANGED
@@ -7,7 +7,7 @@
 #include "fattn-wmma-f16.cuh"
 
 #ifdef FP16_MMA_AVAILABLE
-#if !
+#if !defined(GGML_USE_HIP)
 #include <mma.h>
 #ifdef GGML_USE_MUSA
 namespace wmma = mtmusa::wmma;
@@ -18,7 +18,7 @@ namespace wmma = nvcuda::wmma;
 #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
 #include <rocwmma/rocwmma.hpp>
 namespace wmma = rocwmma;
-#endif // !
+#endif // !defined(GGML_USE_HIP)
 #endif // FP16_MMA_AVAILABLE
 
 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
@@ -546,7 +546,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
         return;
     }
 
-#if !
+#if !defined(GGML_USE_HIP)
     if (Q->ne[1] <= 8 && Q->ne[0] % warp_size == 0) {
         constexpr int cols_per_block = 8;
         switch (Q->ne[0]) {
@@ -568,7 +568,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
         }
         return;
     }
-#endif // !
+#endif // !defined(GGML_USE_HIP)
 
     if (Q->ne[1] <= 32) {
         constexpr int cols_per_block = 16;
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
@@ -128,7 +128,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     return err;
 }
 
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP)
 static int ggml_cuda_parse_id(char devName[]) {
     // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
     // these values are not stable so this is susceptible to breakage
@@ -175,10 +175,10 @@ static int ggml_cuda_parse_id(char devName[]) {
     archNum += archMinor;
     return archNum;
 }
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP)
 
 static ggml_cuda_device_info ggml_cuda_init() {
-#
+#if defined(GGML_USE_HIP)
     // Workaround for a rocBLAS bug when using multiple graphics cards:
     // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
     {
@@ -251,7 +251,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].nsm = prop.multiProcessorCount;
         info.devices[id].smpb = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
 
         info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
@@ -281,7 +281,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
        info.devices[id].cc = 100*prop.major + 10*prop.minor;
        GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
                      id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP)
     }
 
     for (int id = 0; id < info.device_count; ++id) {
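For reference on the two compute-capability paths above (parsing prop.gcnArchName under HIP versus 100*major + 10*minor otherwise), a small host-only sketch of device enumeration using only the runtime property fields that appear in the diff; it is illustrative, not the repository's initialization code:

#include <cuda_runtime.h>
#include <cstdio>

// Print a per-device "cc" the way the non-HIP branch derives it.
// (The HIP branch instead parses prop.gcnArchName, e.g. "gfx90a", into an id.)
int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess) {
        return 1;
    }
    for (int id = 0; id < count; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        const int cc = 100*prop.major + 10*prop.minor;
        printf("Device %d: %s, cc %d, warp size %d, smem/block %zu\n",
               id, prop.name, cc, prop.warpSize, prop.sharedMemPerBlock);
    }
    return 0;
}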
ggml/src/ggml-cuda/mma.cuh
CHANGED
@@ -68,7 +68,7 @@ namespace ggml_cuda_mma {
         static constexpr int I = I_;
         static constexpr int J = J_;
 
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP)
         static constexpr int ne = I * J / 64;
         T x[ne] = {0};
 
@@ -132,7 +132,7 @@ namespace ggml_cuda_mma {
                 static_assert(I == -1 && J == -1, "template specialization not implemented");
             }
         }
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP)
     };
 
     template <int I_, int J_>
ggml/src/ggml-cuda/mmq.cuh
CHANGED
@@ -104,9 +104,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
     return 128;
 #else // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
 
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP)
     return 64;
-#else // defined(GGML_USE_HIP)
+#else // defined(GGML_USE_HIP)
 
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #ifdef GGML_CUDA_FORCE_MMQ
@@ -118,7 +118,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
     return 64;
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP)
 #endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
 }
 
@@ -128,7 +128,7 @@ static int get_mmq_y_host(const int cc) {
 }
 
 static constexpr __device__ int get_mmq_y_device() {
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP)
 #if defined(RDNA1)
     return 64;
 #else
@@ -140,7 +140,7 @@ static constexpr __device__ int get_mmq_y_device() {
 #else
     return 64;
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP)
 }
 
 // Decouple shared memory tile sizes from WARP_SIZE to allow for different warp sizes.
@@ -250,7 +250,7 @@ static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/)
 }
 #endif // AMD_MFMA_AVAILABLE
 
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP)
 static int mmq_get_nwarps_host(const int cc) {
     return amd_mfma_available(cc) ? 8 : 4;
 }
@@ -258,10 +258,10 @@ static int mmq_get_nwarps_host(const int cc) {
 static int mmq_get_nwarps_host(const int /*cc*/) {
     return 8;
 }
-#endif // (GGML_USE_HIP)
+#endif // (GGML_USE_HIP)
 
 static constexpr __device__ int mmq_get_nwarps_device() {
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP)
 #if defined(AMD_MFMA_AVAILABLE)
     return 8;
 #else
@@ -269,7 +269,7 @@ static constexpr __device__ int mmq_get_nwarps_device() {
 #endif // AMD_MFMA_AVAILABLE
 #else
     return 8;
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP)
 }
 
 // ------------------------------------------------------------
@@ -3047,7 +3047,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
 // The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
 
 template <ggml_type type, int mmq_x, bool need_check>
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP)
 #if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
     __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
 #endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
@@ -3057,7 +3057,7 @@ template <ggml_type type, int mmq_x, bool need_check>
 #else
     __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
-#endif // defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP)
 static __global__ void mul_mat_q(
         const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
         const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
@@ -3097,7 +3097,7 @@ static __global__ void mul_mat_q(
     __syncthreads();
 
     // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
-#if (defined(GGML_USE_HIP) &&
+#if (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
     {
         const int wt = blockIdx.z / nchannels_y;
         const int zt = blockIdx.z - wt*nchannels_y;
@@ -3151,7 +3151,7 @@ static __global__ void mul_mat_q(
             tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
         return;
     }
-#endif // (defined(GGML_USE_HIP) &&
+#endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
 
     const int64_t blocks_per_ne00 = ncols_x / qk;
     constexpr int blocks_per_iter = MMQ_ITER_K / qk;
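In the mul_mat_q hunks above the launch bound is computed at compile time as the physical warp size times the per-architecture warp count. A hedged fragment showing that pattern with illustrative helpers (sketch_warp_size, sketch_nwarps, fixed at 8 here), intended to compile as part of a .cu file:

// Compile-time block size = warp size * number of warps, used as a launch bound.
static __host__ __device__ constexpr int sketch_warp_size() {
#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
    return 64;
#else
    return 32;
#endif
}

static __host__ __device__ constexpr int sketch_nwarps() {
    return 8; // illustrative fixed value; the real helpers pick 4 or 8 per arch
}

__global__ void __launch_bounds__(sketch_warp_size()*sketch_nwarps(), 2)
sketch_mmq_like_kernel(float * dst, int n) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        dst[i] = 0.0f;
    }
}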
ggml/src/ggml-cuda/vendors/hip.h
CHANGED
@@ -5,10 +5,8 @@
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
 #include <hip/hip_bfloat16.h>
-#ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
 
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -139,7 +137,7 @@
 #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
 
-#if
+#if HIP_VERSION >= 70000000
 #define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
@@ -151,7 +149,11 @@
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define cublasComputeType_t hipblasDatatype_t
 #define cudaDataType_t hipblasDatatype_t
-#endif
+#endif // HIP_VERSION >= 7000000
+
+#if !defined(__HIP_PLATFORM_AMD__)
+#error "The HIP backend supports only AMD targets"
+#endif // !defined(__HIP_PLATFORM_AMD__)
 
 #define __CUDA_ARCH__ 1300
 
@@ -249,7 +251,7 @@ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigne
     return c;
 }
 
-#if
+#if HIP_VERSION < 50600000
 // __shfl_xor() for half2 was added in ROCm 5.6
 static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
     typedef union half2_b32 {
@@ -261,4 +263,4 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
     tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
     return tmp.val;
 }
-#endif //
+#endif // HIP_VERSION < 50600000
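The HIP_VERSION comparisons above rely on HIP's single-integer version encoding; to the best of my knowledge it is major*10000000 + minor*100000 + patch, so 50600000 corresponds to ROCm 5.6 and 70000000 to ROCm 7.0, but treat that formula as an assumption since the diff does not state it. A small sketch of the same compile-time gating together with the new AMD-only guard (hypothetical macro name SKETCH_HAS_HALF2_SHFL), meant to be built with hipcc for an AMD target:

// Hedged sketch of version- and platform-gating in a HIP translation unit.
#include <hip/hip_runtime.h>

// Reject non-AMD HIP platforms, mirroring the new guard in vendors/hip.h.
#if !defined(__HIP_PLATFORM_AMD__)
#error "this sketch, like the ggml HIP backend, targets AMD only"
#endif

// Assumption: HIP_VERSION encodes major*10000000 + minor*100000 + patch.
#if HIP_VERSION < 50600000
// Older ROCm: a fallback would be needed for features added in 5.6
// (as hip.h above does for __shfl_xor on half2).
#define SKETCH_HAS_HALF2_SHFL 0
#else
#define SKETCH_HAS_HALF2_SHFL 1
#endif

int main() {
    return SKETCH_HAS_HALF2_SHFL ? 0 : 1;
}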