Spaces:
Sleeping
Sleeping
Neo Zhang Jianyu
commited on
Revert "[SYCL] fix error when set main gpu to non-zero (llama/5901)" (llama/5918)
Browse files- ggml-sycl.cpp +60 -94
- ggml-sycl.h +0 -1
ggml-sycl.cpp
CHANGED
|
@@ -3559,31 +3559,12 @@ class sycl_gpu_mgr {
|
|
| 3559 |
int work_group_size = 0;
|
| 3560 |
std::string gpus_list = "";
|
| 3561 |
|
| 3562 |
-
/*
|
| 3563 |
-
Use all GPU with same top max compute units
|
| 3564 |
-
*/
|
| 3565 |
sycl_gpu_mgr() {
|
| 3566 |
detect_sycl_gpu_list_with_max_cu();
|
| 3567 |
get_allow_gpus();
|
| 3568 |
create_context_with_gpus();
|
| 3569 |
}
|
| 3570 |
|
| 3571 |
-
/*
|
| 3572 |
-
Use the assigned GPU as only one
|
| 3573 |
-
*/
|
| 3574 |
-
sycl_gpu_mgr(int main_gpu_id) {
|
| 3575 |
-
sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
|
| 3576 |
-
dpct::device_info prop;
|
| 3577 |
-
dpct::get_device_info(prop, device);
|
| 3578 |
-
gpus.push_back(main_gpu_id);
|
| 3579 |
-
devices.push_back(device);
|
| 3580 |
-
work_group_size = prop.get_max_work_group_size();
|
| 3581 |
-
max_compute_units = prop.get_max_compute_units();
|
| 3582 |
-
|
| 3583 |
-
get_allow_gpus();
|
| 3584 |
-
create_context_with_gpus();
|
| 3585 |
-
}
|
| 3586 |
-
|
| 3587 |
void create_context_with_gpus() {
|
| 3588 |
sycl::context ctx = sycl::context(devices);
|
| 3589 |
assert(gpus.size() > 0);
|
|
@@ -3599,7 +3580,7 @@ class sycl_gpu_mgr {
|
|
| 3599 |
gpus_list += std::to_string(gpus[i]);
|
| 3600 |
gpus_list += ",";
|
| 3601 |
}
|
| 3602 |
-
if (gpus_list.length() >
|
| 3603 |
gpus_list.pop_back();
|
| 3604 |
}
|
| 3605 |
}
|
|
@@ -3648,8 +3629,8 @@ class sycl_gpu_mgr {
|
|
| 3648 |
if (gpus[i] == id)
|
| 3649 |
return i;
|
| 3650 |
}
|
| 3651 |
-
|
| 3652 |
-
|
| 3653 |
}
|
| 3654 |
|
| 3655 |
int get_next_index(int id) {
|
|
@@ -3658,7 +3639,8 @@ class sycl_gpu_mgr {
|
|
| 3658 |
if (gpus[i] == id)
|
| 3659 |
return i;
|
| 3660 |
}
|
| 3661 |
-
|
|
|
|
| 3662 |
}
|
| 3663 |
};
|
| 3664 |
|
|
@@ -3667,7 +3649,6 @@ static int g_device_count = -1;
|
|
| 3667 |
static int g_all_sycl_device_count = -1;
|
| 3668 |
static int g_main_device = -1;
|
| 3669 |
static int g_main_device_id = -1;
|
| 3670 |
-
static bool g_ggml_backend_sycl_buffer_type_initialized = false;
|
| 3671 |
|
| 3672 |
static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
|
| 3673 |
|
|
@@ -13244,7 +13225,7 @@ void ggml_backend_sycl_print_sycl_devices() {
|
|
| 13244 |
}
|
| 13245 |
|
| 13246 |
void print_gpu_device_list() {
|
| 13247 |
-
fprintf(stderr, "detect %d SYCL GPUs: [%s] with
|
| 13248 |
g_sycl_gpu_mgr->get_gpu_count(),
|
| 13249 |
g_sycl_gpu_mgr->gpus_list.c_str(),
|
| 13250 |
g_sycl_gpu_mgr->max_compute_units);
|
|
@@ -13283,15 +13264,6 @@ void ggml_init_sycl() try {
|
|
| 13283 |
#else
|
| 13284 |
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
|
| 13285 |
#endif
|
| 13286 |
-
|
| 13287 |
-
/* NOT REMOVE, keep it for next optimize for XMX.
|
| 13288 |
-
#if defined(SYCL_USE_XMX)
|
| 13289 |
-
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
|
| 13290 |
-
#else
|
| 13291 |
-
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
| 13292 |
-
#endif
|
| 13293 |
-
*/
|
| 13294 |
-
|
| 13295 |
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
|
| 13296 |
dpct::dev_mgr::instance().device_count()) != 0) {
|
| 13297 |
initialized = true;
|
|
@@ -13300,61 +13272,68 @@ void ggml_init_sycl() try {
|
|
| 13300 |
}
|
| 13301 |
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
|
| 13302 |
ggml_backend_sycl_print_sycl_devices();
|
| 13303 |
-
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
|
| 13304 |
-
print_gpu_device_list();
|
| 13305 |
-
initialized = true;
|
| 13306 |
-
g_sycl_loaded = true;
|
| 13307 |
-
}
|
| 13308 |
|
|
|
|
| 13309 |
|
|
|
|
|
|
|
| 13310 |
|
| 13311 |
-
|
| 13312 |
-
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
|
| 13313 |
|
| 13314 |
-
|
| 13315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13316 |
|
| 13317 |
-
|
| 13318 |
-
|
| 13319 |
-
|
| 13320 |
-
g_device_caps[id].cc = 0;
|
| 13321 |
-
g_tensor_split[id] = 0;
|
| 13322 |
-
g_default_tensor_split[id] = 0;
|
| 13323 |
-
}
|
| 13324 |
|
| 13325 |
-
|
| 13326 |
-
|
| 13327 |
-
|
| 13328 |
|
| 13329 |
-
|
| 13330 |
-
|
| 13331 |
-
prop, dpct::dev_mgr::instance().get_device(device_id))));
|
| 13332 |
|
| 13333 |
-
|
| 13334 |
-
|
|
|
|
| 13335 |
|
| 13336 |
-
|
| 13337 |
-
|
| 13338 |
-
|
| 13339 |
|
| 13340 |
-
|
| 13341 |
-
|
| 13342 |
-
}
|
| 13343 |
|
| 13344 |
-
|
| 13345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13346 |
|
| 13347 |
-
|
| 13348 |
-
|
| 13349 |
-
SYCL_CHECK(CHECK_TRY_ERROR(
|
| 13350 |
-
g_syclStreams[i][is] =
|
| 13351 |
-
dpct::get_current_device().create_queue(
|
| 13352 |
-
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
|
| 13353 |
}
|
| 13354 |
|
| 13355 |
-
|
| 13356 |
-
|
| 13357 |
-
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
|
| 13358 |
}
|
| 13359 |
}
|
| 13360 |
catch (sycl::exception const &exc) {
|
|
@@ -16753,24 +16732,22 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
|
| 16753 |
/* .is_host = */ nullptr,
|
| 16754 |
};
|
| 16755 |
|
| 16756 |
-
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int
|
| 16757 |
-
if (device_index>=g_device_count or device_index<0) {
|
| 16758 |
-
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
|
| 16759 |
-
device_index, g_device_count-1);
|
| 16760 |
-
GGML_ASSERT(device_index<g_device_count);
|
| 16761 |
-
}
|
| 16762 |
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
|
| 16763 |
|
| 16764 |
-
|
|
|
|
|
|
|
| 16765 |
for (int i = 0; i < g_device_count; i++) {
|
| 16766 |
ggml_backend_sycl_buffer_types[i] = {
|
| 16767 |
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
| 16768 |
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
|
| 16769 |
};
|
| 16770 |
}
|
| 16771 |
-
|
| 16772 |
}
|
| 16773 |
-
|
|
|
|
| 16774 |
}
|
| 16775 |
|
| 16776 |
// sycl split buffer type
|
|
@@ -17519,17 +17496,6 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
|
|
| 17519 |
return g_sycl_gpu_mgr->get_index(device_id);
|
| 17520 |
}
|
| 17521 |
|
| 17522 |
-
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu_id) {
|
| 17523 |
-
GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
|
| 17524 |
-
printf("ggml_backend_sycl_set_single_device: use single device: %d\n", main_gpu_id);
|
| 17525 |
-
if (g_sycl_gpu_mgr) {
|
| 17526 |
-
delete g_sycl_gpu_mgr;
|
| 17527 |
-
}
|
| 17528 |
-
g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
|
| 17529 |
-
ggml_init_sycl();
|
| 17530 |
-
g_ggml_backend_sycl_buffer_type_initialized = false;
|
| 17531 |
-
}
|
| 17532 |
-
|
| 17533 |
extern "C" int ggml_backend_sycl_reg_devices();
|
| 17534 |
|
| 17535 |
int ggml_backend_sycl_reg_devices() {
|
|
|
|
| 3559 |
int work_group_size = 0;
|
| 3560 |
std::string gpus_list = "";
|
| 3561 |
|
|
|
|
|
|
|
|
|
|
| 3562 |
sycl_gpu_mgr() {
|
| 3563 |
detect_sycl_gpu_list_with_max_cu();
|
| 3564 |
get_allow_gpus();
|
| 3565 |
create_context_with_gpus();
|
| 3566 |
}
|
| 3567 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3568 |
void create_context_with_gpus() {
|
| 3569 |
sycl::context ctx = sycl::context(devices);
|
| 3570 |
assert(gpus.size() > 0);
|
|
|
|
| 3580 |
gpus_list += std::to_string(gpus[i]);
|
| 3581 |
gpus_list += ",";
|
| 3582 |
}
|
| 3583 |
+
if (gpus_list.length() > 2) {
|
| 3584 |
gpus_list.pop_back();
|
| 3585 |
}
|
| 3586 |
}
|
|
|
|
| 3629 |
if (gpus[i] == id)
|
| 3630 |
return i;
|
| 3631 |
}
|
| 3632 |
+
assert(false);
|
| 3633 |
+
return -1;
|
| 3634 |
}
|
| 3635 |
|
| 3636 |
int get_next_index(int id) {
|
|
|
|
| 3639 |
if (gpus[i] == id)
|
| 3640 |
return i;
|
| 3641 |
}
|
| 3642 |
+
assert(false);
|
| 3643 |
+
return -1;
|
| 3644 |
}
|
| 3645 |
};
|
| 3646 |
|
|
|
|
| 3649 |
static int g_all_sycl_device_count = -1;
|
| 3650 |
static int g_main_device = -1;
|
| 3651 |
static int g_main_device_id = -1;
|
|
|
|
| 3652 |
|
| 3653 |
static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
|
| 3654 |
|
|
|
|
| 13225 |
}
|
| 13226 |
|
| 13227 |
void print_gpu_device_list() {
|
| 13228 |
+
fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
|
| 13229 |
g_sycl_gpu_mgr->get_gpu_count(),
|
| 13230 |
g_sycl_gpu_mgr->gpus_list.c_str(),
|
| 13231 |
g_sycl_gpu_mgr->max_compute_units);
|
|
|
|
| 13264 |
#else
|
| 13265 |
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
|
| 13266 |
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13267 |
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
|
| 13268 |
dpct::dev_mgr::instance().device_count()) != 0) {
|
| 13269 |
initialized = true;
|
|
|
|
| 13272 |
}
|
| 13273 |
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
|
| 13274 |
ggml_backend_sycl_print_sycl_devices();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13275 |
|
| 13276 |
+
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
|
| 13277 |
|
| 13278 |
+
g_device_count = g_sycl_gpu_mgr->get_gpu_count();
|
| 13279 |
+
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
|
| 13280 |
|
| 13281 |
+
print_gpu_device_list();
|
|
|
|
| 13282 |
|
| 13283 |
+
int64_t total_vram = 0;
|
| 13284 |
|
| 13285 |
+
/* NOT REMOVE, keep it for next optimize for XMX.
|
| 13286 |
+
#if defined(SYCL_USE_XMX)
|
| 13287 |
+
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
|
| 13288 |
+
#else
|
| 13289 |
+
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
| 13290 |
+
#endif
|
| 13291 |
+
*/
|
| 13292 |
+
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
|
| 13293 |
+
g_device_caps[id].vmm = 0;
|
| 13294 |
+
g_device_caps[id].device_id = -1;
|
| 13295 |
+
g_device_caps[id].cc = 0;
|
| 13296 |
+
g_tensor_split[id] = 0;
|
| 13297 |
+
g_default_tensor_split[id] = 0;
|
| 13298 |
+
}
|
| 13299 |
|
| 13300 |
+
for (int i = 0; i < g_device_count; ++i) {
|
| 13301 |
+
int device_id = g_sycl_gpu_mgr->gpus[i];
|
| 13302 |
+
g_device_caps[i].vmm = 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13303 |
|
| 13304 |
+
dpct::device_info prop;
|
| 13305 |
+
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
| 13306 |
+
prop, dpct::dev_mgr::instance().get_device(device_id))));
|
| 13307 |
|
| 13308 |
+
g_default_tensor_split[i] = total_vram;
|
| 13309 |
+
total_vram += prop.get_global_mem_size();
|
|
|
|
| 13310 |
|
| 13311 |
+
g_device_caps[i].cc =
|
| 13312 |
+
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
| 13313 |
+
}
|
| 13314 |
|
| 13315 |
+
for (int i = 0; i < g_device_count; ++i) {
|
| 13316 |
+
g_default_tensor_split[i] /= total_vram;
|
| 13317 |
+
}
|
| 13318 |
|
| 13319 |
+
for (int i = 0; i < g_device_count; ++i) {
|
| 13320 |
+
SYCL_CHECK(ggml_sycl_set_device(i));
|
|
|
|
| 13321 |
|
| 13322 |
+
// create sycl streams
|
| 13323 |
+
for (int is = 0; is < MAX_STREAMS; ++is) {
|
| 13324 |
+
SYCL_CHECK(CHECK_TRY_ERROR(
|
| 13325 |
+
g_syclStreams[i][is] =
|
| 13326 |
+
dpct::get_current_device().create_queue(
|
| 13327 |
+
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
|
| 13328 |
+
}
|
| 13329 |
|
| 13330 |
+
const dpct::queue_ptr stream = g_syclStreams[i][0];
|
| 13331 |
+
// create sycl handle
|
| 13332 |
+
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
|
|
|
|
|
|
|
|
|
|
| 13333 |
}
|
| 13334 |
|
| 13335 |
+
initialized = true;
|
| 13336 |
+
g_sycl_loaded = true;
|
|
|
|
| 13337 |
}
|
| 13338 |
}
|
| 13339 |
catch (sycl::exception const &exc) {
|
|
|
|
| 16732 |
/* .is_host = */ nullptr,
|
| 16733 |
};
|
| 16734 |
|
| 16735 |
+
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16736 |
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
|
| 16737 |
|
| 16738 |
+
static bool ggml_backend_sycl_buffer_type_initialized = false;
|
| 16739 |
+
|
| 16740 |
+
if (!ggml_backend_sycl_buffer_type_initialized) {
|
| 16741 |
for (int i = 0; i < g_device_count; i++) {
|
| 16742 |
ggml_backend_sycl_buffer_types[i] = {
|
| 16743 |
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
| 16744 |
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
|
| 16745 |
};
|
| 16746 |
}
|
| 16747 |
+
ggml_backend_sycl_buffer_type_initialized = true;
|
| 16748 |
}
|
| 16749 |
+
|
| 16750 |
+
return &ggml_backend_sycl_buffer_types[device];
|
| 16751 |
}
|
| 16752 |
|
| 16753 |
// sycl split buffer type
|
|
|
|
| 17496 |
return g_sycl_gpu_mgr->get_index(device_id);
|
| 17497 |
}
|
| 17498 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17499 |
extern "C" int ggml_backend_sycl_reg_devices();
|
| 17500 |
|
| 17501 |
int ggml_backend_sycl_reg_devices() {
|
ggml-sycl.h
CHANGED
|
@@ -28,7 +28,6 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
|
|
| 28 |
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
| 29 |
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
| 30 |
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
|
| 31 |
-
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu);
|
| 32 |
|
| 33 |
#ifdef __cplusplus
|
| 34 |
}
|
|
|
|
| 28 |
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
| 29 |
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
| 30 |
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
|
|
|
|
| 31 |
|
| 32 |
#ifdef __cplusplus
|
| 33 |
}
|