uvos committed on
Commit
e538e2c
·
1 Parent(s): bd93c1b

CUDA/HIP: add warp_size to cuda_device_info

Browse files
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -520,6 +520,7 @@ struct ggml_cuda_device_info {
520
  bool vmm; // virtual memory support
521
  size_t vmm_granularity; // granularity of virtual memory
522
  size_t total_vram;
 
523
  };
524
 
525
  cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
 
520
  bool vmm; // virtual memory support
521
  size_t vmm_granularity; // granularity of virtual memory
522
  size_t total_vram;
523
+ int warp_size; // Number of threads in a dispatch
524
  };
525
 
526
  cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -242,6 +242,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
242
 
243
  info.devices[id].nsm = prop.multiProcessorCount;
244
  info.devices[id].smpb = prop.sharedMemPerBlock;
 
245
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
246
  info.devices[id].smpbo = prop.sharedMemPerBlock;
247
 
@@ -256,8 +257,9 @@ static ggml_cuda_device_info ggml_cuda_init() {
256
  info.devices[id].cc += prop.minor * 0x10;
257
  }
258
  }
259
- GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s\n",
260
- id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, device_vmm ? "yes" : "no");
 
261
  #else
262
  info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
263
  info.devices[id].cc = 100*prop.major + 10*prop.minor;
 
242
 
243
  info.devices[id].nsm = prop.multiProcessorCount;
244
  info.devices[id].smpb = prop.sharedMemPerBlock;
245
+ info.devices[id].warp_size = prop.warpSize;
246
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
247
  info.devices[id].smpbo = prop.sharedMemPerBlock;
248
 
 
257
  info.devices[id].cc += prop.minor * 0x10;
258
  }
259
  }
260
+ GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
261
+ id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
262
+ device_vmm ? "yes" : "no", prop.warpSize);
263
  #else
264
  info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
265
  info.devices[id].cc = 100*prop.major + 10*prop.minor;