Spaces:
Running
Running
HIP: bump requirement to rocm 6.1 (llama/15296)
Browse files
ggml/src/ggml-cuda/common.cuh
CHANGED
|
@@ -464,25 +464,21 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
|
|
| 464 |
}
|
| 465 |
|
| 466 |
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
|
| 467 |
-
#if defined(GGML_USE_HIP)
|
| 468 |
return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
|
| 469 |
-
#elif
|
| 470 |
return __hmax2(a, b);
|
| 471 |
-
#
|
| 472 |
half2 ret;
|
| 473 |
reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a), __low2float(b)));
|
| 474 |
reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
|
| 475 |
return ret;
|
| 476 |
-
#else
|
| 477 |
-
GGML_UNUSED(a);
|
| 478 |
-
GGML_UNUSED(b);
|
| 479 |
-
NO_DEVICE_CODE;
|
| 480 |
#endif
|
| 481 |
}
|
| 482 |
|
| 483 |
template<int width = WARP_SIZE>
|
| 484 |
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
| 485 |
-
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL ||
|
| 486 |
#pragma unroll
|
| 487 |
for (int offset = width/2; offset > 0; offset >>= 1) {
|
| 488 |
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
|
|
@@ -491,7 +487,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
|
| 491 |
#else
|
| 492 |
GGML_UNUSED(x);
|
| 493 |
NO_DEVICE_CODE;
|
| 494 |
-
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL ||
|
| 495 |
}
|
| 496 |
|
| 497 |
#if CUDART_VERSION < CUDART_HMASK
|
|
|
|
| 464 |
}
|
| 465 |
|
| 466 |
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
|
| 467 |
+
#if defined(GGML_USE_HIP)
|
| 468 |
return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
|
| 469 |
+
#elif CUDART_VERSION >= CUDART_HMAX
|
| 470 |
return __hmax2(a, b);
|
| 471 |
+
#else
|
| 472 |
half2 ret;
|
| 473 |
reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a), __low2float(b)));
|
| 474 |
reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
|
| 475 |
return ret;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
#endif
|
| 477 |
}
|
| 478 |
|
| 479 |
template<int width = WARP_SIZE>
|
| 480 |
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
| 481 |
+
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
|
| 482 |
#pragma unroll
|
| 483 |
for (int offset = width/2; offset > 0; offset >>= 1) {
|
| 484 |
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
|
|
|
|
| 487 |
#else
|
| 488 |
GGML_UNUSED(x);
|
| 489 |
NO_DEVICE_CODE;
|
| 490 |
+
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
|
| 491 |
}
|
| 492 |
|
| 493 |
#if CUDART_VERSION < CUDART_HMASK
|
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
|
@@ -180,30 +180,6 @@ static int ggml_cuda_parse_id(char devName[]) {
|
|
| 180 |
#endif // defined(GGML_USE_HIP)
|
| 181 |
|
| 182 |
static ggml_cuda_device_info ggml_cuda_init() {
|
| 183 |
-
#if defined(GGML_USE_HIP)
|
| 184 |
-
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
| 185 |
-
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
|
| 186 |
-
{
|
| 187 |
-
int major_version = 0;
|
| 188 |
-
size_t version_length = 0;
|
| 189 |
-
if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
|
| 190 |
-
std::vector<char> version(version_length+1, '\0');
|
| 191 |
-
if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
|
| 192 |
-
version.resize(::strlen(version.data()));
|
| 193 |
-
int parsed_value = 0;
|
| 194 |
-
if (std::from_chars(version.data(), version.data() + version.size(), parsed_value).ec == std::errc()) {
|
| 195 |
-
major_version = parsed_value;
|
| 196 |
-
}
|
| 197 |
-
}
|
| 198 |
-
}
|
| 199 |
-
if (major_version < 4) {
|
| 200 |
-
GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
|
| 201 |
-
rocblas_initialize();
|
| 202 |
-
CUDA_CHECK(cudaDeviceSynchronize());
|
| 203 |
-
}
|
| 204 |
-
}
|
| 205 |
-
#endif
|
| 206 |
-
|
| 207 |
ggml_cuda_device_info info = {};
|
| 208 |
|
| 209 |
cudaError_t err = cudaGetDeviceCount(&info.device_count);
|
|
|
|
| 180 |
#endif // defined(GGML_USE_HIP)
|
| 181 |
|
| 182 |
static ggml_cuda_device_info ggml_cuda_init() {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
ggml_cuda_device_info info = {};
|
| 184 |
|
| 185 |
cudaError_t err = cudaGetDeviceCount(&info.device_count);
|
ggml/src/ggml-cuda/vendors/hip.h
CHANGED
|
@@ -5,8 +5,6 @@
|
|
| 5 |
#include <hipblas/hipblas.h>
|
| 6 |
#include <hip/hip_fp16.h>
|
| 7 |
#include <hip/hip_bfloat16.h>
|
| 8 |
-
// for rocblas_initialize()
|
| 9 |
-
#include "rocblas/rocblas.h"
|
| 10 |
|
| 11 |
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
| 12 |
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
|
@@ -251,17 +249,3 @@ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigne
|
|
| 251 |
}
|
| 252 |
return c;
|
| 253 |
}
|
| 254 |
-
|
| 255 |
-
#if HIP_VERSION < 50600000
|
| 256 |
-
// __shfl_xor() for half2 was added in ROCm 5.6
|
| 257 |
-
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
|
| 258 |
-
typedef union half2_b32 {
|
| 259 |
-
half2 val;
|
| 260 |
-
int b32;
|
| 261 |
-
} half2_b32_t;
|
| 262 |
-
half2_b32_t tmp;
|
| 263 |
-
tmp.val = var;
|
| 264 |
-
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
|
| 265 |
-
return tmp.val;
|
| 266 |
-
}
|
| 267 |
-
#endif // HIP_VERSION < 50600000
|
|
|
|
| 5 |
#include <hipblas/hipblas.h>
|
| 6 |
#include <hip/hip_fp16.h>
|
| 7 |
#include <hip/hip_bfloat16.h>
|
|
|
|
|
|
|
| 8 |
|
| 9 |
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
| 10 |
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
|
|
|
| 249 |
}
|
| 250 |
return c;
|
| 251 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ggml/src/ggml-hip/CMakeLists.txt
CHANGED
|
@@ -46,8 +46,8 @@ if (GGML_HIP_ROCWMMA_FATTN)
|
|
| 46 |
endif()
|
| 47 |
endif()
|
| 48 |
|
| 49 |
-
if (${hip_VERSION} VERSION_LESS
|
| 50 |
-
message(FATAL_ERROR "At least ROCM/HIP
|
| 51 |
endif()
|
| 52 |
|
| 53 |
message(STATUS "HIP and hipBLAS found")
|
|
|
|
| 46 |
endif()
|
| 47 |
endif()
|
| 48 |
|
| 49 |
+
if (${hip_VERSION} VERSION_LESS 6.1)
|
| 50 |
+
message(FATAL_ERROR "At least ROCM/HIP V6.1 is required")
|
| 51 |
endif()
|
| 52 |
|
| 53 |
message(STATUS "HIP and hipBLAS found")
|