CUDA: add a prop in ggml_cuda_device_info to distinguish iGPU from dGPU (#13856) (llama/13895)
* 1. Add "integrated" to ggml_cuda_device_info to distinguish whether a device is an integrated GPU or a discrete GPU
2. Adjust the function ggml_backend_cuda_device_supports_buft for this new feature
* Update ggml/src/ggml-cuda/ggml-cuda.cu
Adjusted code indentation
Co-authored-by: Johannes Gäßler <[email protected]>
* Update ggml/src/ggml-cuda/ggml-cuda.cu
Fixed incorrect setting of variable types
Co-authored-by: Johannes Gäßler <[email protected]>
* Update ggml/src/ggml-cuda/ggml-cuda.cu
Adjusted the judgment logic
Co-authored-by: Johannes Gäßler <[email protected]>
* Add a host_buft assert for the integrated CUDA device case in evaluate_and_capture_cuda_graph()
* Update ggml/src/ggml-cuda/ggml-cuda.cu
Add a defensive assert
Co-authored-by: Johannes Gäßler <[email protected]>
* Update ggml/src/ggml-cuda/ggml-cuda.cu
Adjusted the support judgment logic.
Co-authored-by: Johannes Gäßler <[email protected]>
* Revert the suggested commit changes since they are not applicable on Jetson devices
* Update ggml/src/ggml-cuda/ggml-cuda.cu
Add parentheses to enforce operator precedence
Co-authored-by: Diego Devesa <[email protected]>
* Update ggml/src/ggml-cuda/ggml-cuda.cu
Fix CI bug: add a space
Co-authored-by: Johannes Gäßler <[email protected]>
---------
Co-authored-by: yangxiao <[email protected]>
Co-authored-by: Johannes Gäßler <[email protected]>
Co-authored-by: yangxiao <[email protected]>
Co-authored-by: Diego Devesa <[email protected]>
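Condensed, the buffer-type support rule this commit introduces is roughly the sketch below. It paraphrases the diff that follows; ggml_backend_buft_is_cuda_host is the helper added by this commit, the other helpers already exist in ggml-cuda.cu, and the function name supports_buft_sketch is made up for illustration.

// Paraphrase of the updated ggml_backend_cuda_device_supports_buft (see the last hunk below):
// CUDA and CUDA-split buffer types are accepted only when they belong to this device;
// pinned host buffers are additionally accepted when the device is an integrated GPU.
static bool supports_buft_sketch(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
    const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;

    const bool is_cuda_buft = ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft);
    return (is_cuda_buft && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft));
}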
@@ -635,6 +635,7 @@ struct ggml_cuda_device_info {
         int     nsm;                // number of streaming multiprocessors
         size_t  smpb;               // max. shared memory per block
         size_t  smpbo;              // max. shared memory per block (with opt-in)
+        bool    integrated;         // Device is integrated as opposed to discrete
         bool    vmm;                // virtual memory support
         size_t  vmm_granularity;    // granularity of virtual memory
         size_t  total_vram;
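The value stored in the new field comes from the CUDA runtime's device properties (the next hunk assigns prop.integrated to it). A minimal standalone query, using only the plain CUDA runtime API and independent of ggml, would look like this:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess) {
        return 1;
    }
    for (int id = 0; id < count; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        // cudaDeviceProp::integrated is 1 for an integrated (motherboard/SoC) GPU,
        // e.g. a Jetson device, and 0 for a discrete card.
        printf("device %d (%s): %s GPU\n", id, prop.name,
               prop.integrated ? "integrated" : "discrete");
    }
    return 0;
}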
@@ -243,10 +243,10 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
-
-        info.devices[id].nsm       = prop.multiProcessorCount;
-        info.devices[id].smpb      = prop.sharedMemPerBlock;
-        info.devices[id].warp_size = prop.warpSize;
+        info.devices[id].integrated = prop.integrated;
+        info.devices[id].nsm        = prop.multiProcessorCount;
+        info.devices[id].smpb       = prop.sharedMemPerBlock;
+        info.devices[id].warp_size  = prop.warpSize;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
 
@@ -1065,6 +1065,10 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
+}
+
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     CUDA_CHECK(cudaFreeHost(buffer->context));
 }
@@ -2641,6 +2645,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
     bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+    // flag used to determine whether it is an integrated_gpu
+    const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
 
     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2659,7 +2665,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             if (node->src[j] != nullptr) {
                 assert(node->src[j]->buffer);
                 assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
-                       ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
+                       ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
             }
         }
 #endif
@@ -3266,7 +3272,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 }
 
 static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
+    const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;
+    return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft)));
 }
 
 static int64_t get_op_batch_size(const ggml_tensor * op) {
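From the user side, the practical effect is that an integrated GPU now reports support for the pinned host buffer type, so tensors can be used from host memory instead of being copied into dedicated VRAM (which an iGPU such as a Jetson shares with the CPU anyway). A rough usage sketch, assuming the ggml-backend device registry API and the "CUDA0" device naming convention:

#include "ggml-backend.h"
#include "ggml-cuda.h"
#include <cstdio>

int main() {
    // Assumption: the first CUDA device is registered under the name "CUDA0".
    ggml_backend_dev_t dev = ggml_backend_dev_by_name("CUDA0");
    if (dev == NULL) {
        printf("no CUDA device registered\n");
        return 0;
    }
    // With this change, supports_buft() also returns true for the pinned host
    // buffer type when the device is an integrated GPU.
    ggml_backend_buffer_type_t host_buft = ggml_backend_cuda_host_buffer_type();
    printf("CUDA0 supports host buffers: %s\n",
           ggml_backend_dev_supports_buft(dev, host_buft) ? "yes" : "no");
    return 0;
}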