OccamRazor commited on
Commit
f833987
·
unverified ·
1 Parent(s): 8f3eb65

Add OpenCL add kernel (llama/5151)

Browse files

* Add OpenCL add kernel

* Put add kernel into different string to stay within MSVC string length limit, disable float16 support due to bad results

Files changed (3) hide show
  1. ggml-opencl.cpp +84 -3
  2. ggml-opencl.h +1 -0
  3. ggml.c +11 -0
ggml-opencl.cpp CHANGED
@@ -714,7 +714,6 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
714
  dst[row] = tmp[0];
715
  }
716
  }
717
-
718
  );
719
 
720
 
@@ -784,6 +783,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
784
  dst[row] = tmp[0];
785
  }
786
  }
 
787
  );
788
 
789
 
@@ -799,6 +799,18 @@ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y
799
  }
800
  );
801
 
 
 
 
 
 
 
 
 
 
 
 
 
802
  #define CL_CHECK(err) \
803
  do { \
804
  cl_int err_ = (err); \
@@ -878,6 +890,7 @@ static std::string generate_kernels() {
878
  }
879
  src << mul_kernel << '\n';
880
  }
 
881
 
882
  return src.str();
883
  }
@@ -893,6 +906,7 @@ static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl,
893
  static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
894
  static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
895
  static cl_kernel mul_f32_cl;
 
896
  static bool fp16_support;
897
 
898
  static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@@ -1100,9 +1114,10 @@ void ggml_cl_init(void) {
1100
  char *ext_buffer = (char *)alloca(ext_str_size + 1);
1101
  clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
1102
  ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
 
1103
  // Check if ext_buffer contains cl_khr_fp16
1104
- fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
1105
- fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
1106
 
1107
  cl_context_properties properties[] = {
1108
  (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
@@ -1150,6 +1165,8 @@ void ggml_cl_init(void) {
1150
 
1151
  // mul kernel
1152
  CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
 
 
1153
  }
1154
 
1155
  static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@@ -1458,6 +1475,70 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
1458
  ggml_cl_mul_f32(src0, src1, dst);
1459
  }
1460
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1461
  static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1462
  const int64_t ne00 = src0->ne[0];
1463
  const int64_t ne01 = src0->ne[1];
 
714
  dst[row] = tmp[0];
715
  }
716
  }
 
717
  );
718
 
719
 
 
783
  dst[row] = tmp[0];
784
  }
785
  }
786
+
787
  );
788
 
789
 
 
799
  }
800
  );
801
 
802
+ std::string add_template = MULTILINE_QUOTE(
803
+ __kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) {
804
+ const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
805
+
806
+ if (i >= get_global_size(0)) {
807
+ return;
808
+ }
809
+
810
+ dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky];
811
+ }
812
+ );
813
+
814
  #define CL_CHECK(err) \
815
  do { \
816
  cl_int err_ = (err); \
 
890
  }
891
  src << mul_kernel << '\n';
892
  }
893
+ src << add_template << '\n';
894
 
895
  return src.str();
896
  }
 
906
  static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
907
  static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
908
  static cl_kernel mul_f32_cl;
909
+ static cl_kernel add_f32_cl;
910
  static bool fp16_support;
911
 
912
  static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
 
1114
  char *ext_buffer = (char *)alloca(ext_str_size + 1);
1115
  clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
1116
  ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
1117
+ // Disabled due to faulty outputs
1118
  // Check if ext_buffer contains cl_khr_fp16
1119
+ fp16_support = false; // strstr(ext_buffer, "cl_khr_fp16") != NULL;
1120
+ // fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
1121
 
1122
  cl_context_properties properties[] = {
1123
  (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
 
1165
 
1166
  // mul kernel
1167
  CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
1168
+
1169
+ CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err));
1170
  }
1171
 
1172
  static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
 
1475
  ggml_cl_mul_f32(src0, src1, dst);
1476
  }
1477
 
1478
+ static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1479
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
1480
+ const int64_t ne00 = src0->ne[0];
1481
+ const int64_t ne01 = src0->ne[1];
1482
+ const int64_t ne02 = src0->ne[2];
1483
+ const int64_t ne03 = src0->ne[3];
1484
+ const int64_t ne10 = src1->ne[0];
1485
+ const int64_t ne11 = src1->ne[1];
1486
+ const int64_t ne12 = src1->ne[2];
1487
+ const int64_t ne13 = src1->ne[3];
1488
+ const int nb2 = dst->nb[2];
1489
+ const int nb3 = dst->nb[3];
1490
+ size_t x_size;
1491
+ size_t d_size;
1492
+
1493
+ cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
1494
+ cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
1495
+ cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
1496
+
1497
+
1498
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
1499
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
1500
+ cl_event ev;
1501
+
1502
+ // copy src0 to device
1503
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
1504
+
1505
+ const int64_t i13 = i03%ne13;
1506
+ const int64_t i12 = i02%ne12;
1507
+ const int i1 = i13*ne12*ne11 + i12*ne11;
1508
+
1509
+ cl_int x_offset = 0;
1510
+ cl_int y_offset = i1*ne10;
1511
+ cl_int d_offset = 0;
1512
+
1513
+ size_t global = ne00 * ne01;
1514
+ cl_int ky = ne10 * ne11;
1515
+
1516
+ CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X));
1517
+ CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset));
1518
+ CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y));
1519
+ CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset));
1520
+ CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D));
1521
+ CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset));
1522
+ CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky));
1523
+ CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
1524
+
1525
+ CL_CHECK(clReleaseEvent(ev));
1526
+ CL_CHECK(clFinish(queue));
1527
+
1528
+ // copy dst to host
1529
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
1530
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
1531
+ }
1532
+ }
1533
+ ggml_cl_pool_free(d_X, x_size);
1534
+ ggml_cl_pool_free(d_D, d_size);
1535
+ }
1536
+
1537
+ void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1538
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1539
+ ggml_cl_add_f32(src0, src1, dst);
1540
+ }
1541
+
1542
  static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1543
  const int64_t ne00 = src0->ne[0];
1544
  const int64_t ne01 = src0->ne[1];
ggml-opencl.h CHANGED
@@ -10,6 +10,7 @@ extern "C" {
10
  GGML_API void ggml_cl_init(void);
11
 
12
  GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 
13
  GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
14
  GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
15
  GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
 
10
  GGML_API void ggml_cl_init(void);
11
 
12
  GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
13
+ GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
14
  GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
15
  GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
16
  GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
ggml.c CHANGED
@@ -7207,6 +7207,17 @@ static void ggml_compute_forward_add_f32(
7207
  const int ith = params->ith;
7208
  const int nth = params->nth;
7209
 
 
 
 
 
 
 
 
 
 
 
 
7210
  const int nr = ggml_nrows(src0);
7211
 
7212
  GGML_TENSOR_BINARY_OP_LOCALS
 
7207
  const int ith = params->ith;
7208
  const int nth = params->nth;
7209
 
7210
+ #ifdef GGML_USE_CLBLAST
7211
+ if (src1->backend == GGML_BACKEND_GPU) {
7212
+ // TODO: OpenCL kernel support full broadcast
7213
+ GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7214
+ if (ith == 0) {
7215
+ ggml_cl_add(src0, src1, dst);
7216
+ }
7217
+ return;
7218
+ }
7219
+ #endif
7220
+
7221
  const int nr = ggml_nrows(src0);
7222
 
7223
  GGML_TENSOR_BINARY_OP_LOCALS