Diego Devesa committed
Commit · 9d74d85
Parent(s): 5239c28
ggml-backend : add device and backend reg interfaces (llama/9707)
Also:
- metal : fix compute pass descriptor autorelease crash
- ggml-backend : add device description to CPU backend
- ggml: unify backend logging mechanism
- ggml/include/ggml-backend.h +1 -4
- ggml/include/ggml-cann.h +0 -11
- ggml/include/ggml-cuda.h +0 -2
- ggml/include/ggml-metal.h +0 -2
- ggml/include/ggml.h +4 -0
- ggml/src/ggml-backend-impl.h +0 -3
- ggml/src/ggml-backend.cpp +83 -19
- ggml/src/ggml-cann.cpp +13 -76
- ggml/src/ggml-cuda.cu +29 -73
- ggml/src/ggml-impl.h +15 -0
- ggml/src/ggml-metal.m +62 -116
- ggml/src/ggml.c +67 -25
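
Migration note: this commit removes the per-backend log setters (ggml_backend_cuda_log_set_callback, ggml_backend_metal_log_set_callback, ggml_backend_cann_log_set_callback, and the registry-wide ggml_backend_set_log_callback) in favor of a single ggml_log_set() in ggml.h. A minimal usage sketch, assuming only the API added in the diffs below; the my_log name and the use of a FILE * as user_data are illustrative:

    #include "ggml.h"
    #include <stdio.h>

    // signature matches the ggml_log_callback typedef in ggml.h
    static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        // "text" arrives pre-formatted, including newlines
        fputs(text, (FILE *) user_data);
    }

    int main(void) {
        // one call now covers every backend (CPU, CUDA, Metal, CANN, ...)
        ggml_log_set(my_log, stderr);
        // ggml_log_set(NULL, NULL) would restore the stderr default
        return 0;
    }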
ggml/include/ggml-backend.h
CHANGED
@@ -164,7 +164,7 @@ extern "C" {
     GGML_API size_t             ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
     GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
     GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
-
+
 
     // Functions that may be obtained using ggml_backend_reg_get_proc_address
     typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
@@ -184,9 +184,6 @@ extern "C" {
     GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
     GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
 
-    // Set the log callback for all registered backends
-    GGML_API void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data);
-
     // Direct backend (stream) initialization
     // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
     GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
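
The registry surface kept in this header is enough to walk every device without any logging hooks. A hedged sketch of such an enumeration: ggml_backend_reg_count() and ggml_backend_reg_get() appear in the removed ggml_backend_set_log_callback body in ggml-backend.cpp below, while ggml_backend_dev_name() is assumed to be the matching device-name accessor from the same API:

    #include "ggml-backend.h"
    #include <stdio.h>

    static void list_devices(void) {
        // iterate over all registered backends, then over each backend's devices
        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
            ggml_backend_reg_t reg = ggml_backend_reg_get(i);
            for (size_t j = 0; j < ggml_backend_reg_dev_count(reg); j++) {
                ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, j);
                printf("device %zu.%zu: %s\n", i, j, ggml_backend_dev_name(dev));
            }
        }
    }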
ggml/include/ggml-cann.h
CHANGED
@@ -116,17 +116,6 @@ GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
                                                   size_t* free,
                                                   size_t* total);
 
-/**
- * @brief Set the logging callback for GGML.
- *
- * This function sets the logging callback and user data for logging.
- *
- * @param log_callback The logging callback to set.
- * @param user_data User data to pass to the logging callback.
- */
-GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
-                                                 void* user_data);
-
 #ifdef __cplusplus
 }
 #endif
ggml/include/ggml-cuda.h
CHANGED
@@ -40,8 +40,6 @@ GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, siz
 GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
-GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
 GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
 
 #ifdef __cplusplus
ggml/include/ggml-metal.h
CHANGED
@@ -39,8 +39,6 @@ extern "C" {
 // user-code should use only these functions
 //
 
-GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
 GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
ggml/include/ggml.h
CHANGED
@@ -2174,6 +2174,10 @@ extern "C" {
     typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
     typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
+
     // optimization parameters
     //
     //   see ggml.c (ggml_opt_default_params) for default values
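
Per the new comment, not installing a callback (or passing NULL) keeps output on stderr, and the level argument makes filtering easy. A sketch of a severity filter, using only enum values referenced elsewhere in this commit:

    #include "ggml.h"
    #include <stdio.h>

    // drop INFO/DEBUG chatter, keep warnings and errors (sketch)
    static void quiet_log(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR) {
            fputs(text, stderr);
        }
    }

Installed with ggml_log_set(quiet_log, NULL).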
ggml/src/ggml-backend-impl.h
CHANGED
@@ -215,9 +215,6 @@ extern "C" {
         // (optional) get a pointer to a function in the backend
         // backends can add custom functions that are not part of the standard ggml-backend interface
         void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
-
-        // (optional) set the log callback for the backend
-        void (*set_log_callback)(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data);
     };
 
     struct ggml_backend_reg {
ggml/src/ggml-backend.cpp
CHANGED
@@ -1,5 +1,13 @@
 // Note: porting this file to C++ is a work in progress
 
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 #include "ggml-backend-impl.h"
 #include "ggml-alloc.h"
 #include "ggml-impl.h"
@@ -10,9 +18,15 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-
+#include <string>
 #include <vector>
 
+#ifdef __APPLE__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+
 // backend buffer type
 
 const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
@@ -505,12 +519,6 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
     return reg->iface.get_proc_address(reg, name);
 }
 
-void ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) {
-    if (reg->iface.set_log_callback) {
-        reg->iface.set_log_callback(reg, log_callback, user_data);
-    }
-}
-
 // Backend registry
 
 #ifdef GGML_USE_CUDA
@@ -614,13 +622,6 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
     return NULL;
 }
 
-void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data) {
-    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
-        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
-        ggml_backend_reg_set_log_callback(reg, log_callback, user_data);
-    }
-}
-
 // Convenience functions
 ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
     ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
@@ -1021,6 +1022,70 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size)
 
 ////////////////////////
 
+struct ggml_backend_cpu_device_context {
+    std::string description = "CPU";
+
+    ggml_backend_cpu_device_context() {
+#ifdef __APPLE__
+        size_t len = 0;
+        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
+            description.resize(len);
+            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
+        }
+#elif defined(__linux__)
+        FILE * f = fopen("/proc/cpuinfo", "r");
+        if (f) {
+            char buf[1024];
+            while (fgets(buf, sizeof(buf), f)) {
+                if (strncmp(buf, "model name", 10) == 0) {
+                    char * p = strchr(buf, ':');
+                    if (p) {
+                        p++;
+                        while (std::isspace(*p)) {
+                            p++;
+                        }
+                        while (std::isspace(p[strlen(p) - 1])) {
+                            p[strlen(p) - 1] = '\0';
+                        }
+                        description = p;
+                        break;
+                    }
+                }
+            }
+            fclose(f);
+        }
+#elif defined(_WIN32)
+        HKEY hKey;
+        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
+                        TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
+                        0,
+                        KEY_READ,
+                        &hKey) == ERROR_SUCCESS) {
+            DWORD cpu_brand_size = 0;
+            if (RegQueryValueExA(hKey,
+                                TEXT("ProcessorNameString"),
+                                NULL,
+                                NULL,
+                                NULL,
+                                &cpu_brand_size) == ERROR_SUCCESS) {
+                description.resize(cpu_brand_size);
+                if (RegQueryValueExA(hKey,
+                                    TEXT("ProcessorNameString"),
+                                    NULL,
+                                    NULL,
+                                    (LPBYTE)&description[0], // NOLINT
+                                    &cpu_brand_size) == ERROR_SUCCESS) {
+                    if (description.find('\0') != std::string::npos) {
+                        description.resize(description.find('\0'));
+                    }
+                }
+            }
+            RegCloseKey(hKey);
+        }
+#endif
+    }
+};
+
 static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
     return "CPU";
 
@@ -1028,10 +1093,9 @@ static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
 }
 
 static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
-
-    return "CPU";
+    struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
 
-
+    return ctx->description.c_str();
 }
 
 static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
@@ -1144,10 +1208,11 @@ static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
 static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
     GGML_ASSERT(index == 0);
 
+    static ggml_backend_cpu_device_context ctx;
     static ggml_backend_device ggml_backend_cpu_device = {
         /* .iface   = */ ggml_backend_cpu_device_i,
         /* .reg     = */ reg,
-        /* .context = */
+        /* .context = */ &ctx,
     };
 
     return &ggml_backend_cpu_device;
@@ -1161,7 +1226,6 @@ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
     /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
     /* .get_device       = */ ggml_backend_cpu_reg_get_device,
     /* .get_proc_address = */ NULL,
-    /* .set_log_callback = */ NULL,
 };
 
 ggml_backend_reg_t ggml_backend_cpu_reg(void) {
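
With the device context in place, the CPU device's description is computed once (sysctl on macOS, /proc/cpuinfo on Linux, the registry on Windows) and then served from a static context. A hedged sketch of reading it back: ggml_backend_dev_by_name() is shown in the header diff above, while ggml_backend_dev_description() is assumed to be the public wrapper over iface.get_description:

    #include "ggml-backend.h"
    #include <stdio.h>

    static void print_cpu_brand(void) {
        ggml_backend_dev_t dev = ggml_backend_dev_by_name("CPU"); // "CPU" is the name returned above
        if (dev != NULL) {
            // e.g. a CPU brand string instead of the old hardcoded "CPU"
            printf("%s\n", ggml_backend_dev_description(dev));
        }
    }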
ggml/src/ggml-cann.cpp
CHANGED
@@ -39,69 +39,6 @@
 
 #include "ggml-common.h"
 
-/**
- * @brief Default logging callback for GGML.
- *
- * This function is the default logging callback that logs messages to stderr.
- *
- * @param level The log level.
- * @param msg The log message.
- * @param user_data User data passed to the callback.
- */
-static void ggml_cann_default_log_callback(enum ggml_log_level level,
-                                           const char* msg, void* user_data) {
-    GGML_UNUSED(level);
-    GGML_UNUSED(user_data);
-    fprintf(stderr, "%s", msg);
-}
-
-ggml_log_callback ggml_cann_log_callback = ggml_cann_default_log_callback;
-void* ggml_cann_log_user_data = NULL;
-
-GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
-                                                 void* user_data) {
-    ggml_cann_log_callback = log_callback;
-    ggml_cann_log_user_data = user_data;
-}
-
-#define GGML_CANN_LOG_INFO(...) ggml_cann_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
-#define GGML_CANN_LOG_WARN(...) ggml_cann_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
-#define GGML_CANN_LOG_ERROR(...) \
-    ggml_cann_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-
-GGML_ATTRIBUTE_FORMAT(2, 3)
-
-/**
- * @brief Log a message using the current logging callback.
- *
- * This function formats a log message and passes it to the current logging
- * callback.
- *
- * @param level The log level.
- * @param format The format string for the log message.
- * @param ... The arguments for the format string.
- */
-static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
-    if (ggml_cann_log_callback != NULL) {
-        va_list args;
-        va_start(args, format);
-        char buffer[128];
-        int len = vsnprintf(buffer, 128, format, args);
-        if (len < 128) {
-            ggml_cann_log_callback(level, buffer, ggml_cann_log_user_data);
-        } else {
-            // vsnprintf adds a null terminator
-            std::vector<char> buffer2(len + 1);
-            va_end(args);
-            va_start(args, format);
-            vsnprintf(&buffer2[0], buffer2.size(), format, args);
-            ggml_cann_log_callback(level, buffer2.data(),
-                                   ggml_cann_log_user_data);
-        }
-        va_end(args);
-    }
-}
-
 /**
  * @brief Handles CANN errors by printing an error message and aborting.
  *
@@ -116,10 +53,10 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
     int32_t id = -1;
     aclrtGetDevice(&id);
 
-    GGML_CANN_LOG_ERROR("CANN error: %s\n", msg);
-    GGML_CANN_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func,
+    GGML_LOG_ERROR("CANN error: %s\n", msg);
+    GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func,
                         file, line);
-    GGML_CANN_LOG_ERROR("  %s\n", stmt);
+    GGML_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ABORT("CANN error");
 }
@@ -165,7 +102,7 @@ static ggml_cann_device_info ggml_cann_init() {
     aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
 
     if (err != ACL_SUCCESS) {
-        GGML_CANN_LOG_ERROR("%s: failed to initialize CANN: %s\n",
+        GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
                         __func__, aclGetRecentErrMsg());
         return info;
     }
@@ -315,7 +252,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CANN_MALLOC
-        GGML_CANN_LOG_INFO(
+        GGML_LOG_INFO(
             "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
             "requested %u MB\n",
             __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
@@ -470,7 +407,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
        // add to the pool
        pool_size += reserve_size;
 
-        // GGML_CANN_LOG_INFO("cann pool[%d]: size increased to %llu MB (
+        // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
         //   reserved %llu MB)\n",
         //   device, (unsigned long long) (pool_size/1024/1024),
         //   (unsigned long long) (reserve_size/1024/1024));
@@ -483,7 +420,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
         pool_used += size;
 
 #ifdef DEBUG_CANN_MALLOC
-        GGML_CANN_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
+        GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
                            (unsigned long long)size, (unsigned long long)ptr);
 #endif
         return ptr;
@@ -497,7 +434,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
      */
     void free(void* ptr, size_t size) override {
 #ifdef DEBUG_CANN_MALLOC
-        GGML_CANN_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
+        GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
                            (unsigned long long)size, (unsigned long long)ptr);
 #endif
 
@@ -1095,7 +1032,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     void* dev_ptr;
     aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
     if (err != ACL_SUCCESS) {
-        GGML_CANN_LOG_ERROR(
+        GGML_LOG_ERROR(
             "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
             __func__, size / 1024.0 / 1024.0, buft_ctx->device,
             aclGetRecentErrMsg());
@@ -1280,7 +1217,7 @@ static void * ggml_cann_host_malloc(size_t size) {
     aclError err = aclrtMallocHost((void **) &hostPtr, size);
     if (err != ACL_SUCCESS) {
 
-        GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                            size / 1024.0 / 1024.0, aclGetRecentErrMsg());
         return nullptr;
     }
@@ -1733,7 +1670,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
         bool ok = ggml_cann_compute_forward(*cann_ctx, node);
 
         if (!ok) {
-            GGML_CANN_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
+            GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
                                 node->name, ggml_op_name(node->op));
         }
         GGML_ASSERT(ok);
@@ -2043,13 +1980,13 @@ static ggml_guid_t ggml_backend_cann_guid() {
 ggml_backend_t ggml_backend_cann_init(int32_t device) {
     aclInit(nullptr);
     if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
-        GGML_CANN_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
+        GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
     if (ctx == nullptr) {
-        GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return nullptr;
     }
     ggml_cann_set_device(ctx->device);
ggml/src/ggml-cuda.cu
CHANGED
@@ -58,52 +58,14 @@
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
-static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
-    GGML_UNUSED(level);
-    GGML_UNUSED(user_data);
-    fprintf(stderr, "%s", msg);
-}
-
-ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
-void * ggml_cuda_log_user_data = NULL;
-
-GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
-    ggml_cuda_log_callback = log_callback;
-    ggml_cuda_log_user_data = user_data;
-}
-
-#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
-#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
-#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-
-GGML_ATTRIBUTE_FORMAT(2, 3)
-static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
-    if (ggml_cuda_log_callback != NULL) {
-        va_list args;
-        va_start(args, format);
-        char buffer[128];
-        int len = vsnprintf(buffer, 128, format, args);
-        if (len < 128) {
-            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
-        } else {
-            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
-            va_end(args);
-            va_start(args, format);
-            vsnprintf(&buffer2[0], buffer2.size(), format, args);
-            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
-        }
-        va_end(args);
-    }
-}
-
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);
 
-    GGML_CUDA_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
-    GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
-    GGML_CUDA_LOG_ERROR("  %s\n", stmt);
+    GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
+    GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ABORT to get a stack trace
     GGML_ABORT(GGML_CUDA_NAME " error");
 }
@@ -168,7 +130,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+        GGML_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }
 
@@ -176,16 +138,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     int64_t total_vram = 0;
 #ifdef GGML_CUDA_FORCE_MMQ
-    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
-    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
 #endif // GGML_CUDA_FORCE_MMQ
 #ifdef GGML_CUDA_FORCE_CUBLAS
-    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
 #else
-    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
 #endif // GGML_CUDA_FORCE_CUBLAS
-    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+    GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
@@ -206,7 +168,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        GGML_CUDA_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -314,7 +276,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+        GGML_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
             (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
         return ptr;
@@ -329,7 +291,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        GGML_CUDA_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -593,7 +555,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
+        GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -1018,7 +980,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                            size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -2291,7 +2253,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_MUL_MAT:
             if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+                GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                 return false;
             } else {
                 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2375,7 +2337,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
+        GGML_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }
 
@@ -2444,7 +2406,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
 
     if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
 #ifndef NDEBUG
-        GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
+        GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
 #endif
         return false;
     }
@@ -2560,7 +2522,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2611,14 +2573,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
             use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
+            GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
         }
 
         if (node->op == GGML_OP_MUL_MAT_ID) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+            GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
         }
 
@@ -2627,7 +2589,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
             // Changes in batch size or context size can cause changes to the grid size of some kernels.
             use_cuda_graph = false;
 #ifndef NDEBUG
-            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+            GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
         }
 
@@ -2639,7 +2601,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
             if (!ptr) {
                 use_cuda_graph = false;
 #ifndef NDEBUG
-                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+                GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
 #endif
             } else {
                 if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
@@ -2663,7 +2625,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2702,7 +2664,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 
             bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
             if (!ok) {
-                GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
             }
             GGML_ASSERT(ok);
         }
@@ -2721,7 +2683,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+                GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2788,7 +2750,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-            GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
+            GGML_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate
@@ -2890,7 +2852,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
         // clear the error
         cudaGetLastError();
 
-        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                            size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }
@@ -3322,17 +3284,11 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
     return nullptr;
 }
 
-static void ggml_backend_cuda_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) {
-    GGML_UNUSED(reg);
-    ggml_backend_cuda_log_set_callback(log_callback, user_data);
-}
-
 static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
     /* .get_name          = */ ggml_backend_cuda_reg_get_name,
     /* .get_device_count  = */ ggml_backend_cuda_reg_get_device_count,
     /* .get_device_get    = */ ggml_backend_cuda_reg_get_device,
     /* .get_proc_address  = */ ggml_backend_cuda_reg_get_proc_address,
-    /* .set_log_callback  = */ ggml_backend_cuda_reg_set_log_callback,
 };
 
 // backend registry
@@ -3378,13 +3334,13 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
 
 ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
+        GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
+        GGML_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
     }
 
ggml/src/ggml-impl.h
CHANGED
@@ -33,6 +33,21 @@ extern "C" {
 #endif
 #endif
 
+//
+// logging
+//
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+void ggml_log_internal        (enum ggml_log_level level, const char * format, ...);
+void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
+
+#define GGML_LOG(...)       ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
+#define GGML_LOG_INFO(...)  ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define GGML_LOG_WARN(...)  ggml_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define GGML_LOG_CONT(...)  ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
+
 // bitset
 
 typedef uint32_t ggml_bitset_t;
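
Every per-backend log call deleted above now lands on these shared macros. ggml_log_internal() itself lives in ggml/src/ggml.c (changed in this commit, +67 -25, but not shown on this page); a plausible sketch following the vsnprintf-with-heap-fallback pattern of the per-backend loggers it replaces. The g_log_* globals are illustrative stand-ins for whatever state ggml_log_set() actually writes:

    #include "ggml-impl.h"
    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>

    static ggml_log_callback g_log_callback  = ggml_log_callback_default;
    static void *            g_log_user_data = NULL;

    void ggml_log_internal(enum ggml_log_level level, const char * format, ...) {
        va_list args, args_copy;
        va_start(args, format);
        va_copy(args_copy, args);

        char buffer[128];
        int len = vsnprintf(buffer, sizeof(buffer), format, args);
        if (len < (int) sizeof(buffer)) {
            g_log_callback(level, buffer, g_log_user_data);
        } else {
            // message did not fit the stack buffer: format again on the heap
            char * buffer2 = malloc(len + 1);
            vsnprintf(buffer2, len + 1, format, args_copy);
            g_log_callback(level, buffer2, g_log_user_data);
            free(buffer2);
        }

        va_end(args_copy);
        va_end(args);
    }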
ggml/src/ggml-metal.m
CHANGED
@@ -18,19 +18,6 @@
 // max number of MTLCommandBuffer used to submit a graph for processing
 #define GGML_METAL_MAX_COMMAND_BUFFERS 8
 
-#ifdef GGML_METAL_NDEBUG
-#define GGML_METAL_LOG(...)
-#define GGML_METAL_LOG_INFO(...)
-#define GGML_METAL_LOG_WARN(...)
-#define GGML_METAL_LOG_ERROR(...)
-#else
-#define GGML_METAL_LOG(...)       ggml_metal_log(GGML_LOG_LEVEL_NONE, __VA_ARGS__)
-#define GGML_METAL_LOG_INFO(...)  ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
-#define GGML_METAL_LOG_WARN(...)  ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
-#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-#define GGML_METAL_LOG_DEBUG(...) ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
-#endif
-
 #define UNUSED(x) (void)(x)
 
 struct ggml_metal_kernel {
@@ -230,8 +217,6 @@ struct ggml_backend_metal_context {
     id<MTLDevice>       device;
     id<MTLCommandQueue> queue;
 
-    MTLComputePassDescriptor * edesc;
-
     dispatch_queue_t d_queue;
 
     struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
@@ -277,51 +262,19 @@ struct ggml_backend_metal_context {
 @implementation GGMLMetalClass
 @end
 
-static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
-    fprintf(stderr, "%s", msg);
-
-    UNUSED(level);
-    UNUSED(user_data);
-}
-
-ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
-void * ggml_metal_log_user_data = NULL;
-
-GGML_ATTRIBUTE_FORMAT(2, 3)
-static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
-    if (ggml_metal_log_callback != NULL) {
-        va_list args;
-        va_start(args, format);
-        char buffer[128];
-        int len = vsnprintf(buffer, 128, format, args);
-        if (len < 128) {
-            ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
-        } else {
-            char* buffer2 = malloc(len+1);
-            va_end(args);
-            va_start(args, format);
-            vsnprintf(buffer2, len+1, format, args);
-            buffer2[len] = 0;
-            ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
-            free(buffer2);
-        }
-        va_end(args);
-    }
-}
-
 static void * ggml_metal_host_malloc(size_t n) {
     void * data = NULL;
 
 #if TARGET_OS_OSX
     kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
     if (err != KERN_SUCCESS) {
-        GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
+        GGML_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
         return NULL;
     }
 #else
     const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
     if (result != 0) {
-        GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
+        GGML_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
         return NULL;
     }
 #endif
@@ -330,27 +283,25 @@ static void * ggml_metal_host_malloc(size_t n) {
 }
 
 static struct ggml_backend_metal_context * ggml_metal_init(void) {
-    GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
+    GGML_LOG_INFO("%s: allocating\n", __func__);
 
 #if TARGET_OS_OSX && !GGML_METAL_NDEBUG
     // Show all the Metal device instances in the system
     NSArray * devices = MTLCopyAllDevices();
     for (id<MTLDevice> device in devices) {
-        GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
+        GGML_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
     }
     [devices release]; // since it was created by a *Copy* C method
 #endif
 
     // Pick and show default Metal device
     id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
+    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     // Configure context
     struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
     ctx->device = device;
     ctx->queue  = [ctx->device newCommandQueue];
-    ctx->edesc  = MTLComputePassDescriptor.computePassDescriptor;
-    ctx->edesc.dispatchType = MTLDispatchTypeSerial;
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
     id<MTLLibrary> metal_library;
@@ -381,28 +332,28 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
     if (try_metallib && path_lib != nil) {
         // pre-compiled library found
         NSURL * libURL = [NSURL fileURLWithPath:path_lib];
-        GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
+        GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
 
         metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
         if (error) {
-            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
         }
     } else {
 #if GGML_METAL_EMBED_LIBRARY
-        GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);
+        GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
 
         extern const char ggml_metallib_start[];
         extern const char ggml_metallib_end[];
 
         NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
 #else
-        GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
+        GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
         NSString * path_source;
         NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
 
-        GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
+        GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
 
         if (path_resource) {
             path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
@@ -411,15 +362,15 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
         }
 
         if (path_source == nil) {
-            GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
+            GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
             path_source = @"ggml-metal.metal";
         }
 
-        GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
+        GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
 
         NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
         if (error) {
-            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
         }
 #endif // GGML_METAL_EMBED_LIBRARY
@@ -435,7 +386,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
 
         metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
         if (error) {
-            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
         }
     }
@@ -443,7 +394,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
     }
 
     // print MTL GPU family:
-    GGML_METAL_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]);
+    GGML_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]);
 
     const NSInteger MTLGPUFamilyMetal3 = 5001;
 
@@ -453,21 +404,21 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
     {
         for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
             if ([ctx->device supportsFamily:i]) {
-                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d  (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
+                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d  (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
                 break;
             }
         }
 
        for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
             if ([ctx->device supportsFamily:i]) {
-                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
+                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
                 break;
             }
         }
 
         for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) {
             if ([ctx->device supportsFamily:i]) {
-                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d  (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i);
+                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d  (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i);
                 break;
             }
         }
@@ -478,9 +429,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
 
     ctx->support_simdgroup_mm = [ctx->device supportsFamily:MTLGPUFamilyApple7];
 
-    GGML_METAL_LOG_INFO("%s: simdgroup reduction support   = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+    GGML_LOG_INFO("%s: simdgroup reduction support   = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false");
+    GGML_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
+    GGML_LOG_INFO("%s: hasUnifiedMemory              = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
 
     ctx->capture_next_compute = false;
     ctx->capture_started = false;
@@ -494,13 +445,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
 
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
     if (@available(macOS 10.12, iOS 16.0, *)) {
-        GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
+        GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
     }
 #elif TARGET_OS_OSX
     if (ctx->device.maxTransferRate != 0) {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
+        GGML_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
     } else {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate               = built-in GPU\n", __func__);
+        GGML_LOG_INFO("%s: maxTransferRate               = built-in GPU\n", __func__);
     }
 #endif
 
@@ -513,7 +464,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
     }
 
     /*
-        GGML_METAL_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
+        GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
                 (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
                 (int) kernel->pipeline.threadExecutionWidth); \
     */
@@ -524,12 +475,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
         kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
         [metal_function release]; \
         if (error) { \
-            GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+            GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
             [metal_library release]; \
             return NULL; \
         } \
     } else { \
-        GGML_METAL_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \
+        GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \
     }
 
     // simd_sum and simd_max requires MTLGPUFamilyApple7
@@ -726,7 +677,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) {
 }
 
 static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
-    GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
+    GGML_LOG_INFO("%s: deallocating\n", __func__);
 
     for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
         [ctx->kernels[i].pipeline release];
@@ -764,7 +715,7 @@ struct ggml_backend_metal_buffer_context {
 // Metal buffer based on the host memory pointer
 //
 static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
-    //GGML_METAL_LOG_INFO("%s: data tensor '%16s'\n", __func__, t->name);
+    //GGML_LOG_INFO("%s: data tensor '%16s'\n", __func__, t->name);
 
     const int64_t tsize = ggml_nbytes(t);
 
@@ -776,17 +727,17 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
     for (int i = 0; i < buf_ctx->n_buffers; ++i) {
         const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
 
-        //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+        //GGML_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
         if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
             *offs = (size_t) ioffs;
 
-            //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
+            //GGML_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
 
             return buf_ctx->buffers[i].metal;
         }
     }
 
-    GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
+    GGML_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
 
     return nil;
 }
@@ -918,7 +869,7 @@ static void ggml_metal_encode_node(
 
     struct ggml_tensor * node = ggml_graph_node(gf, idx);
 
-    //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
+    //GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
 
     struct ggml_tensor * src0 = node->src[0];
     struct ggml_tensor * src1 = node->src[1];
@@ -944,7 +895,7 @@ static void ggml_metal_encode_node(
     }
 
     if (!ggml_metal_supports_op(ctx, dst)) {
-        GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
+        GGML_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
         GGML_ABORT("unsupported op");
     }
 
@@ -1002,17 +953,17 @@ static void ggml_metal_encode_node(
     id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
     id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;
 
-    //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
+    //GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
     //if (src0) {
-    //    GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
+    //    GGML_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
     //            ggml_is_contiguous(src0), src0->name);
     //}
     //if (src1) {
-    //    GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
+    //    GGML_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
     //            ggml_is_contiguous(src1), src1->name);
     //}
     //if (dst) {
-    //    GGML_METAL_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
+    //    GGML_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
     //            dst->name);
     //}
 
@@ -1404,7 +1355,7 @@ static void ggml_metal_encode_node(
                 } break;
             default:
                 {
-                    GGML_METAL_LOG_ERROR("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
+                    GGML_LOG_ERROR("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
                     GGML_ABORT("fatal error");
                 }
         } break;
@@ -1956,7 +1907,7 @@ static void ggml_metal_encode_node(
                     } break;
                 default:
                     {
-                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
+                        GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
                         GGML_ABORT("not implemented");
                     }
             };
@@ -2252,7 +2203,7 @@ static void ggml_metal_encode_node(
                     } break;
                 default:
                     {
-                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
+                        GGML_LOG_ERROR("Asserting on type %d\n", (int)src2t);
                         GGML_ABORT("not implemented");
                     }
             };
@@ -2821,8 +2772,8 @@ static void ggml_metal_encode_node(
                 //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
                 default:
                     {
-                        GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
-                        GGML_METAL_LOG_ERROR("add template specialization for this size\n");
+                        GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                        GGML_LOG_ERROR("add template specialization for this size\n");
                         GGML_ABORT("add template specialization for this size");
                     }
             }
@@ -2834,8 +2785,8 @@ static void ggml_metal_encode_node(
                 //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
                 default:
                     {
-                        GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
-                        GGML_METAL_LOG_ERROR("add template specialization for this size\n");
+                        GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                        GGML_LOG_ERROR("add template specialization for this size\n");
                         GGML_ABORT("add template specialization for this size");
                     }
             }
@@ -2996,7 +2947,7 @@ static void ggml_metal_encode_node(
             } break;
         default:
             {
-                GGML_METAL_LOG_ERROR("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
+                GGML_LOG_ERROR("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
                 GGML_ABORT("fatal error");
             }
     }
@@ -3041,7 +2992,7 @@ static enum ggml_status ggml_metal_graph_compute(
 
         NSError * error = nil;
         if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
-            GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
+            GGML_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
         } else {
             [ctx->capture_scope beginScope];
             ctx->capture_started = true;
@@ -3060,7 +3011,7 @@ static enum ggml_status ggml_metal_graph_compute(
             const int n_nodes_per_cb = ctx->n_nodes_per_cb;
 
             id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
-            id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: ctx->edesc];
+            id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
 
             int node_start = 0;
             int node_end   = n_nodes_0;
@@ -3122,9 +3073,9 @@ static enum ggml_status ggml_metal_graph_compute(
 
         MTLCommandBufferStatus status = [command_buffer status];
         if (status != MTLCommandBufferStatusCompleted) {
-            GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
+            GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
             if (status == MTLCommandBufferStatusError) {
-                GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
+                GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
             }
 
             return GGML_STATUS_FAILED;
@@ -3137,9 +3088,9 @@ static enum ggml_status ggml_metal_graph_compute(
 
         MTLCommandBufferStatus status = [command_buffer status];
         if (status != MTLCommandBufferStatusCompleted) {
-            GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
+            GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
             if (status == MTLCommandBufferStatusError) {
-                GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
+                GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
             }
 
             return GGML_STATUS_FAILED;
@@ -3156,7 +3107,7 @@ static enum ggml_status ggml_metal_graph_compute(
         }
 
         if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
-            GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+            GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i);
             return GGML_STATUS_ABORTED;
         }
 
@@ -3285,17 +3236,17 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
|
|
| 3285 |
#ifndef GGML_METAL_NDEBUG
|
| 3286 |
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
| 3287 |
if (@available(macOS 10.12, iOS 16.0, *)) {
|
| 3288 |
-
|
| 3289 |
__func__,
|
| 3290 |
size_aligned / 1024.0 / 1024.0,
|
| 3291 |
device.currentAllocatedSize / 1024.0 / 1024.0,
|
| 3292 |
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
| 3293 |
|
| 3294 |
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
| 3295 |
-
|
| 3296 |
}
|
| 3297 |
} else {
|
| 3298 |
-
|
| 3299 |
__func__,
|
| 3300 |
size_aligned / 1024.0 / 1024.0,
|
| 3301 |
device.currentAllocatedSize / 1024.0 / 1024.0);
|
|
@@ -3337,7 +3288,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
|
|
| 3337 |
}
|
| 3338 |
|
| 3339 |
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
|
| 3340 |
-
|
| 3341 |
free(ctx);
|
| 3342 |
ggml_backend_metal_free_device();
|
| 3343 |
return NULL;
|
|
@@ -3422,7 +3373,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
|
|
| 3422 |
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
| 3423 |
|
| 3424 |
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
| 3425 |
-
|
| 3426 |
return false;
|
| 3427 |
}
|
| 3428 |
}
|
|
@@ -3448,7 +3399,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
|
|
| 3448 |
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
| 3449 |
|
| 3450 |
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
| 3451 |
-
|
| 3452 |
return false;
|
| 3453 |
}
|
| 3454 |
}
|
|
@@ -3456,7 +3407,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
|
|
| 3456 |
ggml_backend_metal_log_allocated_size(device, size_step_aligned);
|
| 3457 |
|
| 3458 |
if (i + size_step < size) {
|
| 3459 |
-
|
| 3460 |
}
|
| 3461 |
|
| 3462 |
++ctx->n_buffers;
|
|
@@ -3513,7 +3464,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
|
| 3513 |
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
|
| 3514 |
|
| 3515 |
if (ctx->n_cb > 2) {
|
| 3516 |
-
|
| 3517 |
}
|
| 3518 |
}
|
| 3519 |
|
|
@@ -3543,11 +3494,6 @@ static struct ggml_backend_i ggml_backend_metal_i = {
|
|
| 3543 |
/* .event_wait = */ NULL,
|
| 3544 |
};
|
| 3545 |
|
| 3546 |
-
void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
| 3547 |
-
ggml_metal_log_callback = log_callback;
|
| 3548 |
-
ggml_metal_log_user_data = user_data;
|
| 3549 |
-
}
|
| 3550 |
-
|
| 3551 |
static ggml_guid_t ggml_backend_metal_guid(void) {
|
| 3552 |
static ggml_guid guid = { 0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6 };
|
| 3553 |
return &guid;
|
|
@@ -3556,7 +3502,7 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
|
|
| 3556 |
ggml_backend_t ggml_backend_metal_init(void) {
|
| 3557 |
struct ggml_backend_metal_context * ctx = ggml_metal_init();
|
| 3558 |
if (ctx == NULL) {
|
| 3559 |
-
|
| 3560 |
return NULL;
|
| 3561 |
}
|
| 3562 |
|
|
|
|
| 18 |
// max number of MTLCommandBuffer used to submit a graph for processing
|
| 19 |
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
#define UNUSED(x) (void)(x)
|
| 22 |
|
| 23 |
struct ggml_metal_kernel {
|
|
|
|
| 217 |
id<MTLDevice> device;
|
| 218 |
id<MTLCommandQueue> queue;
|
| 219 |
|
|
|
|
|
|
|
| 220 |
dispatch_queue_t d_queue;
|
| 221 |
|
| 222 |
struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
|
|
|
|
| 262 |
@implementation GGMLMetalClass
|
| 263 |
@end
|
| 264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
static void * ggml_metal_host_malloc(size_t n) {
|
| 266 |
void * data = NULL;
|
| 267 |
|
| 268 |
#if TARGET_OS_OSX
|
| 269 |
kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
|
| 270 |
if (err != KERN_SUCCESS) {
|
| 271 |
+
GGML_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
|
| 272 |
return NULL;
|
| 273 |
}
|
| 274 |
#else
|
| 275 |
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
|
| 276 |
if (result != 0) {
|
| 277 |
+
GGML_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
|
| 278 |
return NULL;
|
| 279 |
}
|
| 280 |
#endif
|
|
|
|
| 283 |
}
|
| 284 |
|
| 285 |
static struct ggml_backend_metal_context * ggml_metal_init(void) {
|
| 286 |
+
GGML_LOG_INFO("%s: allocating\n", __func__);
|
| 287 |
|
| 288 |
#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
|
| 289 |
// Show all the Metal device instances in the system
|
| 290 |
NSArray * devices = MTLCopyAllDevices();
|
| 291 |
for (id<MTLDevice> device in devices) {
|
| 292 |
+
GGML_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
|
| 293 |
}
|
| 294 |
[devices release]; // since it was created by a *Copy* C method
|
| 295 |
#endif
|
| 296 |
|
| 297 |
// Pick and show default Metal device
|
| 298 |
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
|
| 299 |
+
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
|
| 300 |
|
| 301 |
// Configure context
|
| 302 |
struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
|
| 303 |
ctx->device = device;
|
| 304 |
ctx->queue = [ctx->device newCommandQueue];
|
|
|
|
|
|
|
| 305 |
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
| 306 |
|
| 307 |
id<MTLLibrary> metal_library;
|
|
|
|
| 332 |
if (try_metallib && path_lib != nil) {
|
| 333 |
// pre-compiled library found
|
| 334 |
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
|
| 335 |
+
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
|
| 336 |
|
| 337 |
metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
|
| 338 |
if (error) {
|
| 339 |
+
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
| 340 |
return NULL;
|
| 341 |
}
|
| 342 |
} else {
|
| 343 |
#if GGML_METAL_EMBED_LIBRARY
|
| 344 |
+
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
|
| 345 |
|
| 346 |
extern const char ggml_metallib_start[];
|
| 347 |
extern const char ggml_metallib_end[];
|
| 348 |
|
| 349 |
NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
|
| 350 |
#else
|
| 351 |
+
GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
| 352 |
|
| 353 |
NSString * path_source;
|
| 354 |
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
|
| 355 |
|
| 356 |
+
GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
|
| 357 |
|
| 358 |
if (path_resource) {
|
| 359 |
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
|
|
|
|
| 362 |
}
|
| 363 |
|
| 364 |
if (path_source == nil) {
|
| 365 |
+
GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
|
| 366 |
path_source = @"ggml-metal.metal";
|
| 367 |
}
|
| 368 |
|
| 369 |
+
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
|
| 370 |
|
| 371 |
NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
|
| 372 |
if (error) {
|
| 373 |
+
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
| 374 |
return NULL;
|
| 375 |
}
|
| 376 |
#endif // GGML_METAL_EMBED_LIBRARY
|
|
|
|
| 386 |
|
| 387 |
metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
| 388 |
if (error) {
|
| 389 |
+
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
| 390 |
return NULL;
|
| 391 |
}
|
| 392 |
}
|
|
|
|
| 394 |
}
|
| 395 |
|
| 396 |
// print MTL GPU family:
|
| 397 |
+
GGML_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
|
| 398 |
|
| 399 |
const NSInteger MTLGPUFamilyMetal3 = 5001;
|
| 400 |
|
|
|
|
| 404 |
{
|
| 405 |
for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
|
| 406 |
if ([ctx->device supportsFamily:i]) {
|
| 407 |
+
GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
|
| 408 |
break;
|
| 409 |
}
|
| 410 |
}
|
| 411 |
|
| 412 |
for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
|
| 413 |
if ([ctx->device supportsFamily:i]) {
|
| 414 |
+
GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
|
| 415 |
break;
|
| 416 |
}
|
| 417 |
}
|
| 418 |
|
| 419 |
for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) {
|
| 420 |
if ([ctx->device supportsFamily:i]) {
|
| 421 |
+
GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i);
|
| 422 |
break;
|
| 423 |
}
|
| 424 |
}
|
|
|
|
| 429 |
|
| 430 |
ctx->support_simdgroup_mm = [ctx->device supportsFamily:MTLGPUFamilyApple7];
|
| 431 |
|
| 432 |
+
GGML_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false");
|
| 433 |
+
GGML_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
|
| 434 |
+
GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
| 435 |
|
| 436 |
ctx->capture_next_compute = false;
|
| 437 |
ctx->capture_started = false;
|
|
|
|
| 445 |
|
| 446 |
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
| 447 |
if (@available(macOS 10.12, iOS 16.0, *)) {
|
| 448 |
+
GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
|
| 449 |
}
|
| 450 |
#elif TARGET_OS_OSX
|
| 451 |
if (ctx->device.maxTransferRate != 0) {
|
| 452 |
+
GGML_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
|
| 453 |
} else {
|
| 454 |
+
GGML_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
|
| 455 |
}
|
| 456 |
#endif
|
| 457 |
|
|
|
|
| 464 |
}
|
| 465 |
|
| 466 |
/*
|
| 467 |
+
GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
|
| 468 |
(int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
|
| 469 |
(int) kernel->pipeline.threadExecutionWidth); \
|
| 470 |
*/
|
|
|
|
| 475 |
kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
|
| 476 |
[metal_function release]; \
|
| 477 |
if (error) { \
|
| 478 |
+
GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
| 479 |
[metal_library release]; \
|
| 480 |
return NULL; \
|
| 481 |
} \
|
| 482 |
} else { \
|
| 483 |
+
GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \
|
| 484 |
}
|
| 485 |
|
| 486 |
// simd_sum and simd_max requires MTLGPUFamilyApple7
|
|
|
|
| 677 |
}
|
| 678 |
|
| 679 |
static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
|
| 680 |
+
GGML_LOG_INFO("%s: deallocating\n", __func__);
|
| 681 |
|
| 682 |
for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
|
| 683 |
[ctx->kernels[i].pipeline release];
|
|
|
|
| 715 |
// Metal buffer based on the host memory pointer
|
| 716 |
//
|
| 717 |
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
|
| 718 |
+
//GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
| 719 |
|
| 720 |
const int64_t tsize = ggml_nbytes(t);
|
| 721 |
|
|
|
|
| 727 |
for (int i = 0; i < buf_ctx->n_buffers; ++i) {
|
| 728 |
const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
|
| 729 |
|
| 730 |
+
//GGML_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
|
| 731 |
if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
|
| 732 |
*offs = (size_t) ioffs;
|
| 733 |
|
| 734 |
+
//GGML_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
|
| 735 |
|
| 736 |
return buf_ctx->buffers[i].metal;
|
| 737 |
}
|
| 738 |
}
|
| 739 |
|
| 740 |
+
GGML_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
|
| 741 |
|
| 742 |
return nil;
|
| 743 |
}
|
|
|
|
| 869 |
|
| 870 |
struct ggml_tensor * node = ggml_graph_node(gf, idx);
|
| 871 |
|
| 872 |
+
//GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
|
| 873 |
|
| 874 |
struct ggml_tensor * src0 = node->src[0];
|
| 875 |
struct ggml_tensor * src1 = node->src[1];
|
|
|
|
| 895 |
}
|
| 896 |
|
| 897 |
if (!ggml_metal_supports_op(ctx, dst)) {
|
| 898 |
+
GGML_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
|
| 899 |
GGML_ABORT("unsupported op");
|
| 900 |
}
|
| 901 |
|
|
|
|
| 953 |
id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
|
| 954 |
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil;
|
| 955 |
|
| 956 |
+
//GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
|
| 957 |
//if (src0) {
|
| 958 |
+
// GGML_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
|
| 959 |
// ggml_is_contiguous(src0), src0->name);
|
| 960 |
//}
|
| 961 |
//if (src1) {
|
| 962 |
+
// GGML_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
|
| 963 |
// ggml_is_contiguous(src1), src1->name);
|
| 964 |
//}
|
| 965 |
//if (dst) {
|
| 966 |
+
// GGML_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
|
| 967 |
// dst->name);
|
| 968 |
//}
|
| 969 |
|
|
|
|
| 1355 |
} break;
|
| 1356 |
default:
|
| 1357 |
{
|
| 1358 |
+
GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
|
| 1359 |
GGML_ABORT("fatal error");
|
| 1360 |
}
|
| 1361 |
} break;
|
|
|
|
| 1907 |
} break;
|
| 1908 |
default:
|
| 1909 |
{
|
| 1910 |
+
GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
| 1911 |
GGML_ABORT("not implemented");
|
| 1912 |
}
|
| 1913 |
};
|
|
|
|
| 2203 |
} break;
|
| 2204 |
default:
|
| 2205 |
{
|
| 2206 |
+
GGML_LOG_ERROR("Asserting on type %d\n", (int)src2t);
|
| 2207 |
GGML_ABORT("not implemented");
|
| 2208 |
}
|
| 2209 |
};
|
|
|
|
| 2772 |
//case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
|
| 2773 |
default:
|
| 2774 |
{
|
| 2775 |
+
GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
|
| 2776 |
+
GGML_LOG_ERROR("add template specialization for this size\n");
|
| 2777 |
GGML_ABORT("add template specialization for this size");
|
| 2778 |
}
|
| 2779 |
}
|
|
|
|
| 2785 |
//case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
|
| 2786 |
default:
|
| 2787 |
{
|
| 2788 |
+
GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
|
| 2789 |
+
GGML_LOG_ERROR("add template specialization for this size\n");
|
| 2790 |
GGML_ABORT("add template specialization for this size");
|
| 2791 |
}
|
| 2792 |
}
|
|
|
|
| 2947 |
} break;
|
| 2948 |
default:
|
| 2949 |
{
|
| 2950 |
+
GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
|
| 2951 |
GGML_ABORT("fatal error");
|
| 2952 |
}
|
| 2953 |
}
|
|
|
|
| 2992 |
|
| 2993 |
NSError * error = nil;
|
| 2994 |
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
|
| 2995 |
+
GGML_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
|
| 2996 |
} else {
|
| 2997 |
[ctx->capture_scope beginScope];
|
| 2998 |
ctx->capture_started = true;
|
|
|
|
| 3011 |
const int n_nodes_per_cb = ctx->n_nodes_per_cb;
|
| 3012 |
|
| 3013 |
id<MTLCommandBuffer> command_buffer = ctx->command_buffers[cb_idx];
|
| 3014 |
+
id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
|
| 3015 |
|
| 3016 |
int node_start = 0;
|
| 3017 |
int node_end = n_nodes_0;
|
|
|
|
| 3073 |
|
| 3074 |
MTLCommandBufferStatus status = [command_buffer status];
|
| 3075 |
if (status != MTLCommandBufferStatusCompleted) {
|
| 3076 |
+
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
|
| 3077 |
if (status == MTLCommandBufferStatusError) {
|
| 3078 |
+
GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
|
| 3079 |
}
|
| 3080 |
|
| 3081 |
return GGML_STATUS_FAILED;
|
|
|
|
| 3088 |
|
| 3089 |
MTLCommandBufferStatus status = [command_buffer status];
|
| 3090 |
if (status != MTLCommandBufferStatusCompleted) {
|
| 3091 |
+
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
| 3092 |
if (status == MTLCommandBufferStatusError) {
|
| 3093 |
+
GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
|
| 3094 |
}
|
| 3095 |
|
| 3096 |
return GGML_STATUS_FAILED;
|
|
|
|
| 3107 |
}
|
| 3108 |
|
| 3109 |
if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
|
| 3110 |
+
GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i);
|
| 3111 |
return GGML_STATUS_ABORTED;
|
| 3112 |
}
|
| 3113 |
|
|
|
|
| 3236 |
#ifndef GGML_METAL_NDEBUG
|
| 3237 |
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
| 3238 |
if (@available(macOS 10.12, iOS 16.0, *)) {
|
| 3239 |
+
GGML_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
|
| 3240 |
__func__,
|
| 3241 |
size_aligned / 1024.0 / 1024.0,
|
| 3242 |
device.currentAllocatedSize / 1024.0 / 1024.0,
|
| 3243 |
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
| 3244 |
|
| 3245 |
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
| 3246 |
+
GGML_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
| 3247 |
}
|
| 3248 |
} else {
|
| 3249 |
+
GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
|
| 3250 |
__func__,
|
| 3251 |
size_aligned / 1024.0 / 1024.0,
|
| 3252 |
device.currentAllocatedSize / 1024.0 / 1024.0);
|
|
|
|
| 3288 |
}
|
| 3289 |
|
| 3290 |
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
|
| 3291 |
+
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
| 3292 |
free(ctx);
|
| 3293 |
ggml_backend_metal_free_device();
|
| 3294 |
return NULL;
|
|
|
|
| 3373 |
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
| 3374 |
|
| 3375 |
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
| 3376 |
+
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
| 3377 |
return false;
|
| 3378 |
}
|
| 3379 |
}
|
|
|
|
| 3399 |
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
| 3400 |
|
| 3401 |
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
| 3402 |
+
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
|
| 3403 |
return false;
|
| 3404 |
}
|
| 3405 |
}
|
|
|
|
| 3407 |
ggml_backend_metal_log_allocated_size(device, size_step_aligned);
|
| 3408 |
|
| 3409 |
if (i + size_step < size) {
|
| 3410 |
+
GGML_LOG_INFO("\n");
|
| 3411 |
}
|
| 3412 |
|
| 3413 |
++ctx->n_buffers;
|
|
|
|
| 3464 |
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
|
| 3465 |
|
| 3466 |
if (ctx->n_cb > 2) {
|
| 3467 |
+
GGML_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade the performance in some cases\n", __func__, n_cb);
|
| 3468 |
}
|
| 3469 |
}
|
| 3470 |
|
|
|
|
| 3494 |
/* .event_wait = */ NULL,
|
| 3495 |
};
|
| 3496 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3497 |
static ggml_guid_t ggml_backend_metal_guid(void) {
|
| 3498 |
static ggml_guid guid = { 0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6 };
|
| 3499 |
return &guid;
|
|
|
|
| 3502 |
ggml_backend_t ggml_backend_metal_init(void) {
|
| 3503 |
struct ggml_backend_metal_context * ctx = ggml_metal_init();
|
| 3504 |
if (ctx == NULL) {
|
| 3505 |
+
GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
| 3506 |
return NULL;
|
| 3507 |
}
|
| 3508 |
|
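For context: the ggml_metal_get_buffer hunks above only swap the old Metal-specific log macros for the unified GGML_LOG_* family, but they instrument the routine that maps a tensor's host pointer back to one of the backend's registered Metal buffers by computing a byte offset and bounds-checking it against each buffer's size. A minimal C sketch of that lookup, under the assumption that a buffer is described by a base pointer, a size, and an opaque handle (host_buffer and find_buffer are illustrative names, not ggml API):

// Simplified sketch of the buffer lookup instrumented above; the names
// host_buffer and find_buffer are illustrative, not part of ggml.
#include <stdint.h>
#include <stddef.h>

struct host_buffer {
    void * data;   // host address of the start of the buffer
    size_t size;   // buffer size in bytes
    void * handle; // stand-in for the id<MTLBuffer> object
};

// Return the handle of the buffer containing [ptr, ptr + nbytes) and store
// the byte offset in *offs; return NULL when no registered buffer matches,
// which is the case the new GGML_LOG_ERROR("... buffer is nil ...") reports.
void * find_buffer(const struct host_buffer * bufs, int n_buffers,
                   const void * ptr, int64_t nbytes, size_t * offs) {
    for (int i = 0; i < n_buffers; ++i) {
        // same offset arithmetic as the ioffs computation in the hunk above
        const int64_t ioffs = (int64_t) ptr - (int64_t) bufs[i].data;
        if (ioffs >= 0 && ioffs + nbytes <= (int64_t) bufs[i].size) {
            *offs = (size_t) ioffs;
            return bufs[i].handle;
        }
    }
    return NULL;
}

The offset written to *offs is what is later passed together with the returned buffer when the encoder binds tensor arguments.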
ggml/src/ggml.c
CHANGED
@@ -319,26 +319,63 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 // logging
 //

+struct ggml_logger_state {
+    ggml_log_callback log_callback;
+    void * log_callback_user_data;
+};
+static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL};
+
+static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
+    if (format == NULL)
+        return;
+    va_list args_copy;
+    va_copy(args_copy, args);
+    char buffer[128];
+    int len = vsnprintf(buffer, 128, format, args);
+    if (len < 128) {
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
+    } else {
+        char * buffer2 = (char *) calloc(len + 1, sizeof(char));
+        vsnprintf(buffer2, len + 1, format, args_copy);
+        buffer2[len] = 0;
+        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
+        free(buffer2);
+    }
+    va_end(args_copy);
+}
+
+void ggml_log_internal(enum ggml_log_level level, const char * format, ...) {
+    va_list args;
+    va_start(args, format);
+    ggml_log_internal_v(level, format, args);
+    va_end(args);
+}
+
+void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
 #if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...)
+#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #else
 #define GGML_PRINT_DEBUG(...)
 #endif

 #if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...)
+#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #else
 #define GGML_PRINT_DEBUG_5(...)
 #endif

 #if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...)
+#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #else
 #define GGML_PRINT_DEBUG_10(...)
 #endif

-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
 //
 // end of logging block
 //
@@ -355,7 +392,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
     if (size == 0) {
-
+        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
         return NULL;
     }
     void * aligned_memory = NULL;
@@ -377,7 +414,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
             error_desc = "insufficient memory";
             break;
     }
-
+    GGML_LOG_ERROR("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
     GGML_ABORT("fatal error");
     return NULL;
 }
@@ -393,12 +430,12 @@ inline static void * ggml_aligned_malloc(size_t size) {

 inline static void * ggml_malloc(size_t size) {
     if (size == 0) {
-
+        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
         return NULL;
     }
     void * result = malloc(size);
     if (result == NULL) {
-
+        GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
         GGML_ABORT("fatal error");
     }
     return result;
@@ -407,12 +444,12 @@ inline static void * ggml_malloc(size_t size) {

 // calloc
 inline static void * ggml_calloc(size_t num, size_t size) {
     if (num == 0 || size == 0) {
-
+        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
         return NULL;
     }
     void * result = calloc(num, size);
     if (result == NULL) {
-
+        GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
         GGML_ABORT("fatal error");
     }
     return result;
@@ -3349,7 +3386,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
     if (fptr != NULL) {
         char buf[42];
         if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
-
+            GGML_LOG_WARN("/proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
         }
         fclose(fptr);
     }
@@ -3367,21 +3404,21 @@ bool ggml_is_numa(void) {
 ////////////////////////////////////////////////////////////////////////////////

 void ggml_print_object(const struct ggml_object * obj) {
-
+    GGML_LOG_INFO(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
             obj->type, obj->offs, obj->size, (const void *) obj->next);
 }

 void ggml_print_objects(const struct ggml_context * ctx) {
     struct ggml_object * obj = ctx->objects_begin;

-
+    GGML_LOG_INFO("%s: objects in context %p:\n", __func__, (const void *) ctx);

     while (obj != NULL) {
         ggml_print_object(obj);
         obj = obj->next;
     }

-
+    GGML_LOG_INFO("%s: --- end ---\n", __func__);
 }

 int64_t ggml_nelements(const struct ggml_tensor * tensor) {
@@ -3964,7 +4001,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);

     if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
-
+        GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
                 __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
         assert(false);
         return NULL;
@@ -4028,7 +4065,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     if (ctx->scratch.data != NULL) {
         // allocate tensor data in the scratch buffer
         if (ctx->scratch.offs + data_size > ctx->scratch.size) {
-
+            GGML_LOG_WARN("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
                     __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
             assert(false);
             return NULL;
@@ -20136,7 +20173,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     }
 #else
     if (n_threads > threadpool->n_threads_max) {
-
+        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
         n_threads = threadpool->n_threads_max;
     }
@@ -20675,30 +20712,30 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
 }

 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
-
+    GGML_LOG_INFO("=== GRAPH ===\n");

-
+    GGML_LOG_INFO("n_nodes = %d\n", cgraph->n_nodes);
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];

-
+        GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
                 ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ");
     }

-
+    GGML_LOG_INFO("n_leafs = %d\n", cgraph->n_leafs);
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * node = cgraph->leafs[i];

-
+        GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
                 i,
                 node->ne[0], node->ne[1],
                 ggml_op_name(node->op),
                 ggml_get_name(node));
     }

-
+    GGML_LOG_INFO("========================================\n");
 }

 // check if node is part of the graph
@@ -20869,7 +20906,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph

     fclose(fp);

-
+    GGML_LOG_INFO("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
 }

 ////////////////////////////////////////////////////////////////////////////////
@@ -23364,4 +23401,9 @@ int ggml_cpu_get_sve_cnt(void) {
     return 0;
 #endif
 }
+
+void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
+    g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
+    g_logger_state.log_callback_user_data = user_data;
+}
 ////////////////////////////////////////////////////////////////////////////////
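With the logger state above in place, every GGML_LOG_* call in ggml and its backends funnels through ggml_log_internal into one process-wide callback, so a single ggml_log_set call now covers what previously required per-backend setters such as the removed ggml_backend_metal_log_set_callback. A minimal usage sketch, assuming only the ggml_log_set and ggml_log_callback_default declarations from ggml.h and a callback typedef matching the (level, text, user_data) shape used above (my_log and install_logger are illustrative names):

#include <stdio.h>
#include "ggml.h" // ggml_log_set(), enum ggml_log_level, ggml_log_callback_default()

// Route all ggml log output to a caller-supplied FILE * instead of stderr.
static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
    FILE * sink = (FILE *) user_data;
    fprintf(sink, "[ggml:%d] %s", (int) level, text); // messages typically carry their own newline
}

void install_logger(FILE * sink) {
    ggml_log_set(my_log, sink);  // from now on, all backends log through my_log
    // ggml_log_set(NULL, NULL); // would restore ggml_log_callback_default (stderr)
}

Note also the two-pass vsnprintf in ggml_log_internal_v above: messages shorter than 128 bytes are formatted into a stack buffer, and only longer ones take the heap-allocating slow path.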