uvos commited on
Commit
58a3802
·
1 Parent(s): b4896dc

HIP: bump requirement to rocm 6.1 (llama/15296)

Browse files
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -464,25 +464,21 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
464
  }
465
 
466
  static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
467
- #if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
468
  return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
469
- #elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
470
  return __hmax2(a, b);
471
- #elif !defined(GGML_USE_HIP)
472
  half2 ret;
473
  reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a), __low2float(b)));
474
  reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
475
  return ret;
476
- #else
477
- GGML_UNUSED(a);
478
- GGML_UNUSED(b);
479
- NO_DEVICE_CODE;
480
  #endif
481
  }
482
 
483
  template<int width = WARP_SIZE>
484
  static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
485
- #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
486
  #pragma unroll
487
  for (int offset = width/2; offset > 0; offset >>= 1) {
488
  x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
@@ -491,7 +487,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
491
  #else
492
  GGML_UNUSED(x);
493
  NO_DEVICE_CODE;
494
- #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
495
  }
496
 
497
  #if CUDART_VERSION < CUDART_HMASK
 
464
  }
465
 
466
  static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
467
+ #if defined(GGML_USE_HIP)
468
  return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
469
+ #elif CUDART_VERSION >= CUDART_HMAX
470
  return __hmax2(a, b);
471
+ #else
472
  half2 ret;
473
  reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a), __low2float(b)));
474
  reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
475
  return ret;
 
 
 
 
476
  #endif
477
  }
478
 
479
  template<int width = WARP_SIZE>
480
  static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
481
+ #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
482
  #pragma unroll
483
  for (int offset = width/2; offset > 0; offset >>= 1) {
484
  x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
 
487
  #else
488
  GGML_UNUSED(x);
489
  NO_DEVICE_CODE;
490
+ #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
491
  }
492
 
493
  #if CUDART_VERSION < CUDART_HMASK
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -180,30 +180,6 @@ static int ggml_cuda_parse_id(char devName[]) {
180
  #endif // defined(GGML_USE_HIP)
181
 
182
  static ggml_cuda_device_info ggml_cuda_init() {
183
- #if defined(GGML_USE_HIP)
184
- // Workaround for a rocBLAS bug when using multiple graphics cards:
185
- // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
186
- {
187
- int major_version = 0;
188
- size_t version_length = 0;
189
- if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
190
- std::vector<char> version(version_length+1, '\0');
191
- if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
192
- version.resize(::strlen(version.data()));
193
- int parsed_value = 0;
194
- if (std::from_chars(version.data(), version.data() + version.size(), parsed_value).ec == std::errc()) {
195
- major_version = parsed_value;
196
- }
197
- }
198
- }
199
- if (major_version < 4) {
200
- GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
201
- rocblas_initialize();
202
- CUDA_CHECK(cudaDeviceSynchronize());
203
- }
204
- }
205
- #endif
206
-
207
  ggml_cuda_device_info info = {};
208
 
209
  cudaError_t err = cudaGetDeviceCount(&info.device_count);
 
180
  #endif // defined(GGML_USE_HIP)
181
 
182
  static ggml_cuda_device_info ggml_cuda_init() {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  ggml_cuda_device_info info = {};
184
 
185
  cudaError_t err = cudaGetDeviceCount(&info.device_count);
ggml/src/ggml-cuda/vendors/hip.h CHANGED
@@ -5,8 +5,6 @@
5
  #include <hipblas/hipblas.h>
6
  #include <hip/hip_fp16.h>
7
  #include <hip/hip_bfloat16.h>
8
- // for rocblas_initialize()
9
- #include "rocblas/rocblas.h"
10
 
11
  #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
12
  #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -251,17 +249,3 @@ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigne
251
  }
252
  return c;
253
  }
254
-
255
- #if HIP_VERSION < 50600000
256
- // __shfl_xor() for half2 was added in ROCm 5.6
257
- static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
258
- typedef union half2_b32 {
259
- half2 val;
260
- int b32;
261
- } half2_b32_t;
262
- half2_b32_t tmp;
263
- tmp.val = var;
264
- tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
265
- return tmp.val;
266
- }
267
- #endif // HIP_VERSION < 50600000
 
5
  #include <hipblas/hipblas.h>
6
  #include <hip/hip_fp16.h>
7
  #include <hip/hip_bfloat16.h>
 
 
8
 
9
  #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
10
  #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
 
249
  }
250
  return c;
251
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-hip/CMakeLists.txt CHANGED
@@ -46,8 +46,8 @@ if (GGML_HIP_ROCWMMA_FATTN)
46
  endif()
47
  endif()
48
 
49
- if (${hip_VERSION} VERSION_LESS 5.5)
50
- message(FATAL_ERROR "At least ROCM/HIP V5.5 is required")
51
  endif()
52
 
53
  message(STATUS "HIP and hipBLAS found")
 
46
  endif()
47
  endif()
48
 
49
+ if (${hip_VERSION} VERSION_LESS 6.1)
50
+ message(FATAL_ERROR "At least ROCM/HIP V6.1 is required")
51
  endif()
52
 
53
  message(STATUS "HIP and hipBLAS found")