Commit bc1c1a4 · 1 Parent(s): e538e2c
uvos committed

HIP: Prepare reduction operators for wave 64

ggml/src/ggml-cuda/common.cuh CHANGED
@@ -190,53 +190,46 @@ static __device__ void no_device_code(
 #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__
 
+template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }
 
+template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
 }
 
+template<int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
     }
     return a;
 }
 
+template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
-
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32);
-        reinterpret_cast<half&>(a.x) += __low2half(a_other);
-        reinterpret_cast<half&>(a.y) += __high2half(a_other);
-    }
-    return a;
-#else
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
     }
     return a;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
 #else
     NO_DEVICE_CODE;
@@ -244,10 +237,11 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #endif // FP16_AVAILABLE
 }
 
+template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
     }
     return x;
 }
@@ -269,35 +263,34 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
 }
 
 static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
+#if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
+    return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
+#elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
     return __hmax2(a, b);
-#else
+#elif !defined(GGML_USE_HIP)
     half2 ret;
     reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
     reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
     return ret;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
 #else
     GGML_UNUSED(a);
     GGML_UNUSED(b);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif
 }
 
+template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
     }
     return x;
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }
 
 #if CUDART_VERSION < CUDART_HMASK
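
For reference, below is a minimal standalone sketch of the templated shuffle-reduction pattern that common.cuh now uses. The warp_reduce_sum body mirrors the new code above; everything else (the WARP_SIZE fallback, the sum_lane_ids kernel, the host main) is an assumption made for this sketch and is not part of the commit. Per the commit title, the template parameter is what allows the same reduction to later be instantiated for 64-lane wavefronts on AMD GPUs, e.g. warp_reduce_sum<64>(x), instead of hard-coding offset 16 and shuffle width 32.

// warp_reduce_sum_demo.cu -- illustrative sketch only (plain CUDA).
#include <cstdio>
#include <cuda_runtime.h>

#ifndef WARP_SIZE
#define WARP_SIZE 32   // assumption for this sketch; wave64 devices would use 64
#endif

// Butterfly reduction as in common.cuh: after log2(width) XOR-shuffle steps
// every lane in the (sub-)warp holds the full sum.
template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
}

// One warp sums its lane indices; lane 0 writes the result.
static __global__ void sum_lane_ids(float * out) {
    const float sum = warp_reduce_sum(static_cast<float>(threadIdx.x));
    if (threadIdx.x == 0) {
        *out = sum;   // 0 + 1 + ... + 31 = 496 for WARP_SIZE == 32
    }
}

int main() {
    float * d_out = nullptr;
    cudaMalloc(&d_out, sizeof(float));
    sum_lane_ids<<<1, WARP_SIZE>>>(d_out);
    float h_out = 0.0f;
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("warp sum = %g\n", h_out);
    cudaFree(d_out);
    return 0;
}
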
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -240,8 +240,8 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
-        info.devices[id].nsm = prop.multiProcessorCount;
-        info.devices[id].smpb = prop.sharedMemPerBlock;
+        info.devices[id].nsm       = prop.multiProcessorCount;
+        info.devices[id].smpb      = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;