Add OpenCL add kernel (llama/5151)
* Add OpenCL add kernel
* Put add kernel into different string to stay within MSVC string length limit, disable float16 support due to bad results
- ggml-opencl.cpp +84 -3
- ggml-opencl.h +1 -0
- ggml.c +11 -0
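
The MSVC limit mentioned in the commit message is error C2026: a single string literal may only hold roughly 16 KB of characters, and MULTILINE_QUOTE stringifies its entire argument into one literal. Keeping the new add kernel in its own add_template string, rather than appending it to an existing kernel string, stays under that cap. A minimal sketch of the pattern, with a simplified kernel signature for illustration (the real add_f32 also takes buffer offsets and a broadcast period ky):

#include <string>

// The preprocessor turns the block below into ONE string literal, so each
// kernel family must live in its own, separately stringified std::string
// to stay under MSVC's per-literal limit (error C2026, ~16 KB).
#define MULTILINE_QUOTE(...) #__VA_ARGS__

// Simplified signature, illustration only.
std::string add_template = MULTILINE_QUOTE(
__kernel void add_f32(__global float * x, __global float * y, __global float * dst) {
    const int i = get_global_id(0);
    dst[i] = x[i] + y[i];
}
);

Note that stringification collapses newlines into spaces, so the resulting OpenCL source is one long line; this is harmless as long as the kernel avoids line comments.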
ggml-opencl.cpp
CHANGED

@@ -714,7 +714,6 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
         dst[row] = tmp[0];
     }
 }
-
 );
 
 
@@ -784,6 +783,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
         dst[row] = tmp[0];
     }
 }
+
 );
 
 
@@ -799,6 +799,18 @@ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y
 }
 );
 
+std::string add_template = MULTILINE_QUOTE(
+__kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) {
+    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
+
+    if (i >= get_global_size(0)) {
+        return;
+    }
+
+    dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky];
+}
+);
+
 #define CL_CHECK(err) \
     do { \
         cl_int err_ = (err); \
@@ -878,6 +890,7 @@ static std::string generate_kernels() {
         }
         src << mul_kernel << '\n';
     }
+    src << add_template << '\n';
 
     return src.str();
 }
@@ -893,6 +906,7 @@ static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl,
 static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
 static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
 static cl_kernel mul_f32_cl;
+static cl_kernel add_f32_cl;
 static bool fp16_support;
 
 static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@@ -1100,9 +1114,10 @@ void ggml_cl_init(void) {
     char *ext_buffer = (char *)alloca(ext_str_size + 1);
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
     ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
+    // Disabled due to faulty outputs
     // Check if ext_buffer contains cl_khr_fp16
-    fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
-    fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
+    fp16_support = false; // strstr(ext_buffer, "cl_khr_fp16") != NULL;
+    // fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
 
     cl_context_properties properties[] = {
         (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
@@ -1150,6 +1165,8 @@ void ggml_cl_init(void) {
 
     // mul kernel
     CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
+
+    CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err));
 }
 
 static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@@ -1458,6 +1475,70 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
     ggml_cl_mul_f32(src0, src1, dst);
 }
 
+static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+    size_t x_size;
+    size_t d_size;
+
+    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
+    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
+
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            cl_event ev;
+
+            // copy src0 to device
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
+
+            const int64_t i13 = i03%ne13;
+            const int64_t i12 = i02%ne12;
+            const int i1 = i13*ne12*ne11 + i12*ne11;
+
+            cl_int x_offset = 0;
+            cl_int y_offset = i1*ne10;
+            cl_int d_offset = 0;
+
+            size_t global = ne00 * ne01;
+            cl_int ky = ne10 * ne11;
+
+            CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky));
+            CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+
+            CL_CHECK(clReleaseEvent(ev));
+            CL_CHECK(clFinish(queue));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
+        }
+    }
+    ggml_cl_pool_free(d_X, x_size);
+    ggml_cl_pool_free(d_D, d_size);
+}
+
+void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cl_add_f32(src0, src1, dst);
+}
+
 static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
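The y index in the new kernel wraps modulo ky = ne10*ne11, which is how a single src1 row-plane is broadcast across every row of src0 (matching the ggml_can_repeat_rows precondition asserted in ggml.c below). A small CPU reference of that indexing rule, written here for illustration and not part of the commit:

#include <stdio.h>

/* CPU reference of add_f32's broadcast rule: dst[i] = x[i] + y[i % ky]. */
static void add_f32_ref(const float * x, const float * y, float * dst, int n, int ky) {
    for (int i = 0; i < n; i++) {
        dst[i] = x[i] + y[i % ky]; /* src1 repeats every ky elements */
    }
}

int main(void) {
    float x[8] = {0, 1, 2, 3, 4, 5, 6, 7}; /* two src0 rows of four */
    float y[4] = {10, 20, 30, 40};         /* one src1 row, so ky = 4 */
    float d[8];
    add_f32_ref(x, y, d, 8, 4);
    for (int i = 0; i < 8; i++) {
        printf("%g ", d[i]); /* prints: 10 21 32 43 14 25 36 47 */
    }
    printf("\n");
    return 0;
}
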
ggml-opencl.h
CHANGED

@@ -10,6 +10,7 @@ extern "C" {
 GGML_API void ggml_cl_init(void);
 
 GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
 GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
ggml.c
CHANGED

@@ -7207,6 +7207,17 @@ static void ggml_compute_forward_add_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
+#ifdef GGML_USE_CLBLAST
+    if (src1->backend == GGML_BACKEND_GPU) {
+        // TODO: OpenCL kernel support full broadcast
+        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
+        if (ith == 0) {
+            ggml_cl_add(src0, src1, dst);
+        }
+        return;
+    }
+#endif
+
     const int nr = ggml_nrows(src0);
 
     GGML_TENSOR_BINARY_OP_LOCALS
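
ggml_compute_forward_add_f32 runs on every compute thread, so the new guard lets only thread 0 (ith == 0) launch the OpenCL add, which is synchronous thanks to the clFinish inside ggml_cl_add_f32, while the remaining threads return without touching the CPU path. A minimal sketch of that dispatch shape, using hypothetical names rather than ggml's:

/* Hypothetical types/names for illustration; not ggml's API. */
typedef struct { int on_gpu; float * data; } tensor_t;

static void gpu_add(const tensor_t * a, const tensor_t * b, tensor_t * dst) {
    (void)a; (void)b; (void)dst; /* would enqueue add_f32 and clFinish() */
}

static void forward_add(int ith, int nth, const tensor_t * a, const tensor_t * b, tensor_t * dst) {
    if (b->on_gpu) {
        if (ith == 0) {
            gpu_add(a, b, dst); /* blocking, so no barrier is needed after */
        }
        return; /* threads 1..nth-1 skip both the GPU and CPU paths */
    }
    /* ... CPU fallback, rows partitioned across ith / nth ... */
    (void)nth;
}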