uvos committed · commit bc1c1a4 · 1 parent: e538e2c

HIP: Prepare reduction operators for wave 64

Files changed:
- ggml/src/ggml-cuda/common.cuh   +26 -33
- ggml/src/ggml-cuda/ggml-cuda.cu  +2 -2
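For context on what the templated width enables, here is a minimal usage sketch (not part of this commit; the kernel name, shape, and launch assumptions are illustrative). With the default template argument, existing callers keep the WARP_SIZE (32-lane) behaviour, while HIP code targeting wave-64 hardware can instantiate the same reduction over a full 64-lane wavefront:

// Hypothetical example kernel: one warp/wavefront sums one row of x into dst.
// Assumes blockDim.x equals the reduction width chosen below.
static __global__ void row_sum_sketch(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.x;

    float sum = 0.0f;
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        sum += x[row*ncols + col];
    }

    sum = warp_reduce_sum(sum);        // default width = WARP_SIZE
    // sum = warp_reduce_sum<64>(sum); // explicit 64-lane reduction on wave-64 HIP targets

    if (threadIdx.x == 0) {
        dst[row] = sum;
    }
}
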
ggml/src/ggml-cuda/common.cuh
CHANGED
@@ -190,53 +190,46 @@ static __device__ void no_device_code(
 #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__
 
+template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }
 
+template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
 }
 
+template<int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
     }
     return a;
 }
 
+template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
-
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32);
-        reinterpret_cast<half&>(a.x) += __low2half(a_other);
-        reinterpret_cast<half&>(a.y) += __high2half(a_other);
-    }
-    return a;
-#else
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
     }
     return a;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
 #else
     NO_DEVICE_CODE;
@@ -244,10 +237,11 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #endif // FP16_AVAILABLE
 }
 
+template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
     }
     return x;
 }
@@ -269,35 +263,34 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
 }
 
 static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
+#if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
+    return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
+#elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
     return __hmax2(a, b);
-#else
+#elif !defined(GGML_USE_HIP)
     half2 ret;
     reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
     return ret;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
 #else
     GGML_UNUSED(a);
     GGML_UNUSED(b);
     NO_DEVICE_CODE;
-#endif
+#endif
 }
 
+template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
     }
     return x;
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }
 
 #if CUDART_VERSION < CUDART_HMASK

ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
@@ -240,8 +240,8 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
-        info.devices[id].nsm   = prop.multiProcessorCount;
-        info.devices[id].smpb  = prop.sharedMemPerBlock;
+        info.devices[id].nsm = prop.multiProcessorCount;
+        info.devices[id].smpb = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;