Shawn yang Yzzzaz JohannesGaessler yangxiao Diego Devesa committed on
Commit a75e157 · 1 Parent(s): 6fb9674

CUDA: add a prop in ggml_cuda_device_info to distinguish iGPU vs. dGPU in CUDA (#13856) (llama/13895)


* 1. Add "integrated" to ggml_cuda_device_info to distinguish whether the device is an integrated GPU or a discrete GPU
2. Adjust the function ggml_backend_cuda_device_supports_buft for this new feature
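
For reference, this flag maps directly onto cudaDeviceProp::integrated from the CUDA runtime. A minimal standalone sketch (illustrative only, not part of this commit) of querying it:

    #include <cstdio>
    #include <cuda_runtime.h>

    // List each visible device and whether it reports itself as integrated.
    // prop.integrated is the same property the commit copies into
    // info.devices[id].integrated.
    int main() {
        int count = 0;
        if (cudaGetDeviceCount(&count) != cudaSuccess) {
            return 1;
        }
        for (int id = 0; id < count; ++id) {
            cudaDeviceProp prop;
            cudaGetDeviceProperties(&prop, id);
            // non-zero for iGPUs (e.g. Jetson), zero for dGPUs
            printf("device %d: %s (%s)\n", id, prop.name,
                   prop.integrated ? "integrated" : "discrete");
        }
        return 0;
    }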

* Update ggml/src/ggml-cuda/ggml-cuda.cu

Adjusted code indentation

Co-authored-by: Johannes Gäßler <[email protected]>

* Update ggml/src/ggml-cuda/ggml-cuda.cu

Fixed incorrect setting of variable types

Co-authored-by: Johannes Gäßler <[email protected]>

* Update ggml/src/ggml-cuda/ggml-cuda.cu

Adjusted the check logic

Co-authored-by: Johannes Gäßler <[email protected]>

* Add a host_buft assert for the integrated-device case in evaluate_and_capture_cuda_graph()

* Update ggml/src/ggml-cuda/ggml-cuda.cu

Add a defensive assert

Co-authored-by: Johannes Gäßler <[email protected]>

* Update ggml/src/ggml-cuda/ggml-cuda.cu

Adjusted the buffer-type support check logic.

Co-authored-by: Johannes Gäßler <[email protected]>

* Revert the suggested change because it is not applicable on Jetson devices

* Update ggml/src/ggml-cuda/ggml-cuda.cu

Add parentheses to enforce operator precedence (see the precedence sketch below)

Co-authored-by: Diego Devesa <[email protected]>
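
In C++, && binds tighter than ||, so without the added parentheses the buft->device == dev check would only apply to one of the CUDA buffer-type terms in the return expression. A tiny illustrative example (hypothetical booleans standing in for the buffer-type predicates):

    #include <cassert>

    int main() {
        bool is_cuda = true, is_split = false, same_dev = false;
        // Without grouping, "a || b && c" parses as "a || (b && c)",
        // so same_dev would not constrain the is_cuda term.
        assert((is_cuda || is_split && same_dev) == true);
        // With explicit grouping, the device check covers both terms.
        assert(((is_cuda || is_split) && same_dev) == false);
        return 0;
    }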

* Update ggml/src/ggml-cuda/ggml-cuda.cu

Fix CI failure: add a missing space

Co-authored-by: Johannes Gäßler <[email protected]>

---------

Co-authored-by: yangxiao <[email protected]>
Co-authored-by: Johannes Gäßler <[email protected]>
Co-authored-by: yangxiao <[email protected]>
Co-authored-by: Diego Devesa <[email protected]>

ggml/src/ggml-cuda/common.cuh CHANGED
@@ -635,6 +635,7 @@ struct ggml_cuda_device_info {
         int     nsm;                // number of streaming multiprocessors
         size_t  smpb;               // max. shared memory per block
         size_t  smpbo;              // max. shared memory per block (with opt-in)
+        bool    integrated;         // Device is integrated as opposed to discrete
         bool    vmm;                // virtual memory support
         size_t  vmm_granularity;    // granularity of virtual memory
         size_t  total_vram;
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -243,10 +243,10 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
-
-        info.devices[id].nsm       = prop.multiProcessorCount;
-        info.devices[id].smpb      = prop.sharedMemPerBlock;
-        info.devices[id].warp_size = prop.warpSize;
+        info.devices[id].integrated = prop.integrated;
+        info.devices[id].nsm        = prop.multiProcessorCount;
+        info.devices[id].smpb       = prop.sharedMemPerBlock;
+        info.devices[id].warp_size  = prop.warpSize;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
 
@@ -1065,6 +1065,10 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
+}
+
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     CUDA_CHECK(cudaFreeHost(buffer->context));
 }
@@ -2641,6 +2645,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
     bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+    // flag used to determine whether it is an integrated_gpu
+    const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
 
     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2659,7 +2665,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
                 if (node->src[j] != nullptr) {
                     assert(node->src[j]->buffer);
                     assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
-                           ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
+                           ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
                 }
             }
 #endif
@@ -3266,7 +3272,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
 }
 
 static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
+    const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;
+    return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft)));
 }
 
 static int64_t get_op_batch_size(const ggml_tensor * op) {
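
Why the integrated case can treat ggml's CUDA pinned-host buffers as device-accessible: on an iGPU (e.g. Jetson) the GPU shares physical memory with the CPU, so page-locked host allocations can be addressed directly from kernels without a copy. A minimal standalone CUDA sketch of that property (illustrative only, not the commit's code):

    #include <cstdio>
    #include <cuda_runtime.h>

    // Trivial kernel operating on memory that was allocated on the host.
    __global__ void scale(float * x, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            x[i] *= 2.0f;
        }
    }

    int main() {
        const int n = 1024;

        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, 0);

        // Pinned (page-locked) host memory, mapped into the device address
        // space; on an integrated GPU this is the same physical memory the
        // device uses, so no host-to-device copy is required.
        float * host_ptr = nullptr;
        cudaHostAlloc(&host_ptr, n * sizeof(float), cudaHostAllocMapped);
        for (int i = 0; i < n; ++i) {
            host_ptr[i] = 1.0f;
        }

        float * dev_ptr = nullptr;
        cudaHostGetDevicePointer(&dev_ptr, host_ptr, 0);

        scale<<<(n + 255) / 256, 256>>>(dev_ptr, n);
        cudaDeviceSynchronize();

        printf("integrated=%d x[0]=%.1f\n", prop.integrated, host_ptr[0]);

        cudaFreeHost(host_ptr);
        return 0;
    }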