Commit 93e0830 · unverified
Committed by ggerganov
Parent(s): 32589c9

code : normalize enum names (llama/5697)


* code : normalize enum names

ggml-ci

* code : cont

* code : cont

Files changed (7)
  1. ggml-cuda.cu +69 -69
  2. ggml-metal.m +2 -2
  3. ggml-opencl.cpp +25 -25
  4. ggml-sycl.cpp +76 -76
  5. ggml-vulkan.cpp +51 -51
  6. ggml.c +175 -175
  7. ggml.h +19 -19
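
This is a mechanical rename: every enumerator gains its enum type name as a prefix. The full mapping, as it appears in the hunks below, followed by a hypothetical call site showing how code reads before and after the change (the helper is illustrative only, not part of the patch):

    #include "ggml.h"

    // old name               -> new name (all taken verbatim from the hunks below)
    // GGML_SORT_ASC          -> GGML_SORT_ORDER_ASC
    // GGML_SORT_DESC         -> GGML_SORT_ORDER_DESC
    // GGML_BACKEND_CPU       -> GGML_BACKEND_TYPE_CPU
    // GGML_BACKEND_GPU       -> GGML_BACKEND_TYPE_GPU
    // GGML_BACKEND_GPU_SPLIT -> GGML_BACKEND_TYPE_GPU_SPLIT
    // GGML_TASK_INIT         -> GGML_TASK_TYPE_INIT
    // GGML_TASK_COMPUTE      -> GGML_TASK_TYPE_COMPUTE
    // GGML_TASK_FINALIZE     -> GGML_TASK_TYPE_FINALIZE

    // hypothetical helper: the same backend check a caller would write, old vs. new
    static bool dst_uses_main_device_buffer(const struct ggml_tensor * dst, int id, int main_device) {
        // was: dst->backend == GGML_BACKEND_GPU && id == g_main_device
        return dst->backend == GGML_BACKEND_TYPE_GPU && id == main_device;
    }
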
ggml-cuda.cu CHANGED
@@ -6369,11 +6369,11 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
6369
  int ixj = col ^ j;
6370
  if (ixj > col) {
6371
  if ((col & k) == 0) {
6372
- if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
6373
  swap(dst_row[col], dst_row[ixj]);
6374
  }
6375
  } else {
6376
- if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
6377
  swap(dst_row[col], dst_row[ixj]);
6378
  }
6379
  }
@@ -7927,10 +7927,10 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
7927
 
7928
  const dim3 block_dims(ncols, 1, 1);
7929
  const dim3 block_nums(1, nrows, 1);
7930
- if (order == GGML_SORT_ASC) {
7931
- k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
7932
- } else if (order == GGML_SORT_DESC) {
7933
- k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
7934
  } else {
7935
  GGML_ASSERT(false);
7936
  }
@@ -8362,11 +8362,11 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
8362
 
8363
  cudaMemcpyKind kind;
8364
  char * src_ptr;
8365
- if (src->backend == GGML_BACKEND_CPU) {
8366
  kind = cudaMemcpyHostToDevice;
8367
  src_ptr = (char *) src->data;
8368
- } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
8369
- GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
8370
  kind = cudaMemcpyDeviceToDevice;
8371
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
8372
  int id;
@@ -8771,7 +8771,7 @@ static void ggml_cuda_op_mul_mat_q(
8771
 
8772
  // the main device has a larger memory buffer to hold the results from all GPUs
8773
  // nrows_dst == nrows of the matrix that the kernel writes into
8774
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
8775
 
8776
  switch (src0->type) {
8777
  case GGML_TYPE_Q4_0:
@@ -8920,7 +8920,7 @@ static void ggml_cuda_op_mul_mat_vec_q(
8920
 
8921
  // the main device has a larger memory buffer to hold the results from all GPUs
8922
  // nrows_dst == nrows of the matrix that the kernel writes into
8923
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
8924
 
8925
  switch (src0->type) {
8926
  case GGML_TYPE_Q4_0:
@@ -9096,7 +9096,7 @@ static void ggml_cuda_op_mul_mat_cublas(
9096
 
9097
  // the main device has a larger memory buffer to hold the results from all GPUs
9098
  // ldc == nrows of the matrix that cuBLAS writes into
9099
- int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
9100
 
9101
  const int compute_capability = g_device_caps[id].cc;
9102
 
@@ -9444,7 +9444,7 @@ static void ggml_cuda_op_soft_max(
9444
  const bool use_src2 = src2 != nullptr;
9445
 
9446
  if (use_src2) {
9447
- const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
9448
 
9449
  if (src2_on_device) {
9450
  ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
@@ -9502,16 +9502,16 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
9502
  const bool use_src1 = src1 != nullptr;
9503
  const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
9504
 
9505
- GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
9506
- GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
9507
 
9508
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
9509
  ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
9510
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
9511
 
9512
- const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
9513
- const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
9514
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
9515
 
9516
  // dd = data device
9517
  float * src0_ddf = nullptr;
@@ -9555,7 +9555,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
9555
  CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
9556
  }
9557
 
9558
- if (dst->backend == GGML_BACKEND_CPU) {
9559
  CUDA_CHECK(cudaDeviceSynchronize());
9560
  }
9561
  }
@@ -9636,8 +9636,8 @@ static void ggml_cuda_op_mul_mat(
9636
  const int nb2 = dst->nb[2];
9637
  const int nb3 = dst->nb[3];
9638
 
9639
- GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
9640
- GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
9641
  GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
9642
 
9643
  GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
@@ -9653,20 +9653,20 @@ static void ggml_cuda_op_mul_mat(
9653
  ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
9654
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
9655
 
9656
- const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
9657
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
9658
  const bool src1_is_contiguous = ggml_is_contiguous(src1);
9659
 
9660
  const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
9661
 
9662
- const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
9663
  GGML_ASSERT(!(split && ne02 > 1));
9664
  GGML_ASSERT(!(split && ne03 > 1));
9665
  GGML_ASSERT(!(split && ne02 < ne12));
9666
 
9667
  std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
9668
  if (split) {
9669
- // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
9670
  // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
9671
  ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
9672
  tensor_split = buft_ctx->tensor_split;
@@ -9724,8 +9724,8 @@ static void ggml_cuda_op_mul_mat(
9724
 
9725
  used_devices++;
9726
 
9727
- const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
9728
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
9729
 
9730
  ggml_cuda_set_device(id);
9731
  cudaStream_t stream = g_cudaStreams[id][0];
@@ -9776,8 +9776,8 @@ static void ggml_cuda_op_mul_mat(
9776
  continue;
9777
  }
9778
 
9779
- const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
9780
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
9781
  const int64_t row_diff = dev[id].row_high - dev[id].row_low;
9782
 
9783
  ggml_cuda_set_device(id);
@@ -9802,12 +9802,12 @@ static void ggml_cuda_op_mul_mat(
9802
 
9803
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
9804
  // in that case an offset on dst_ddf_i is needed
9805
- if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
9806
  dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
9807
  }
9808
 
9809
  // copy src0, src1 to device if necessary
9810
- if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
9811
  if (id != g_main_device) {
9812
  if (convert_src1_to_q8_1) {
9813
  char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
@@ -9820,14 +9820,14 @@ static void ggml_cuda_op_mul_mat(
9820
  src1_ncols*ne10*sizeof(float), stream));
9821
  }
9822
  }
9823
- } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
9824
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
9825
  src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
9826
  } else {
9827
  GGML_ASSERT(false);
9828
  }
9829
 
9830
- if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
9831
  quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
9832
  CUDA_CHECK(cudaGetLastError());
9833
  }
@@ -9845,10 +9845,10 @@ static void ggml_cuda_op_mul_mat(
9845
  if (!dst_on_device) {
9846
  void * dst_off_device;
9847
  cudaMemcpyKind kind;
9848
- if (dst->backend == GGML_BACKEND_CPU) {
9849
  dst_off_device = dst->data;
9850
  kind = cudaMemcpyDeviceToHost;
9851
- } else if (dst->backend == GGML_BACKEND_GPU) {
9852
  dst_off_device = dst_extra->data_device[g_main_device];
9853
  kind = cudaMemcpyDeviceToDevice;
9854
  } else {
@@ -9913,7 +9913,7 @@ static void ggml_cuda_op_mul_mat(
9913
  }
9914
  }
9915
 
9916
- if (dst->backend == GGML_BACKEND_CPU) {
9917
  ggml_cuda_set_device(g_main_device);
9918
  CUDA_CHECK(cudaDeviceSynchronize());
9919
  }
@@ -10019,7 +10019,7 @@ GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const stru
10019
 
10020
  static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
10021
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
10022
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
10023
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
10024
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
10025
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -10050,7 +10050,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
10050
  GGML_ASSERT(!ggml_is_transposed(src0));
10051
  GGML_ASSERT(!ggml_is_transposed(src1));
10052
  GGML_ASSERT(!ggml_is_permuted(src0));
10053
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
10054
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
10055
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
10056
 
@@ -10109,7 +10109,7 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm
10109
  GGML_ASSERT(!ggml_is_transposed(src0));
10110
  GGML_ASSERT(!ggml_is_transposed(src1));
10111
 
10112
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
10113
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
10114
 
10115
  GGML_TENSOR_BINARY_OP_LOCALS
@@ -10255,11 +10255,11 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm
10255
 
10256
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10257
  const bool all_on_device =
10258
- (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
10259
- (src1->backend == GGML_BACKEND_GPU) &&
10260
- ( dst->backend == GGML_BACKEND_GPU);
10261
 
10262
- const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
10263
 
10264
  int64_t min_compute_capability = INT_MAX;
10265
 
@@ -10409,7 +10409,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
10409
  GGML_ASSERT(!ggml_is_transposed(src00));
10410
  GGML_ASSERT(!ggml_is_transposed(src1));
10411
 
10412
- GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
10413
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
10414
 
10415
  const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
@@ -10553,7 +10553,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
10553
 
10554
  cudaStream_t stream = g_cudaStreams[g_main_device][0];
10555
 
10556
- if (ids->backend == GGML_BACKEND_GPU) {
10557
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
10558
  CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
10559
  CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -10570,20 +10570,20 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
10570
  ggml_tensor src1_row = *src1;
10571
  ggml_tensor dst_row = *dst;
10572
 
10573
- src1_row.backend = GGML_BACKEND_GPU;
10574
- dst_row.backend = GGML_BACKEND_GPU;
10575
 
10576
  src1_row.extra = &src1_row_extra;
10577
  dst_row.extra = &dst_row_extra;
10578
 
10579
- char * src1_original = src1->backend == GGML_BACKEND_CPU ?
10580
  (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
10581
- char * dst_original = dst->backend == GGML_BACKEND_CPU ?
10582
  (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
10583
 
10584
  if (src1->ne[1] == 1) {
10585
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
10586
- GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
10587
 
10588
  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
10589
  //int32_t row_id;
@@ -10611,9 +10611,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
10611
  src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
10612
  dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
10613
 
10614
- const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
10615
  cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
10616
- const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
10617
  cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
10618
 
10619
  for (int32_t row_id = 0; row_id < n_as; ++row_id) {
@@ -10668,7 +10668,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
10668
  }
10669
  }
10670
 
10671
- if (dst->backend == GGML_BACKEND_CPU) {
10672
  CUDA_CHECK(cudaStreamSynchronize(stream));
10673
  }
10674
  }
@@ -10685,8 +10685,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
10685
  const int64_t ne = ggml_nelements(src0);
10686
  GGML_ASSERT(ne == ggml_nelements(src1));
10687
 
10688
- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
10689
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
10690
 
10691
  GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
10692
  GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
@@ -10817,9 +10817,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
10817
  if (!g_cublas_loaded) return false;
10818
 
10819
  ggml_cuda_func_t func;
10820
- const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
10821
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
10822
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
10823
 
10824
  if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
10825
  return false;
@@ -10966,14 +10966,14 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
10966
  return false;
10967
  }
10968
 
10969
- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
10970
  ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
10971
  }
10972
 
10973
  if (params->ith != 0) {
10974
  return true;
10975
  }
10976
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10977
  return true;
10978
  }
10979
  func(tensor->src[0], tensor->src[1], tensor);
@@ -11072,7 +11072,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
11072
 
11073
  extra->data_device[ctx->device] = tensor->data;
11074
 
11075
- tensor->backend = GGML_BACKEND_GPU;
11076
  tensor->extra = extra;
11077
 
11078
  if (ggml_is_quantized(tensor->type)) {
@@ -11087,7 +11087,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
11087
  }
11088
 
11089
  GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
11090
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
11091
 
11092
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
11093
 
@@ -11098,7 +11098,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
11098
  }
11099
 
11100
  GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
11101
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
11102
 
11103
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
11104
 
@@ -11333,7 +11333,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
11333
  CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
11334
  }
11335
  }
11336
- tensor->backend = GGML_BACKEND_GPU_SPLIT;
11337
  tensor->extra = extra;
11338
  }
11339
 
@@ -11605,7 +11605,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
11605
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11606
 
11607
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11608
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
11609
 
11610
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
11611
  }
@@ -11614,7 +11614,7 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
11614
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11615
 
11616
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11617
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
11618
 
11619
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
11620
  }
@@ -11644,7 +11644,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
11644
  ggml_cuda_set_main_device(cuda_ctx->device);
11645
 
11646
  ggml_compute_params params = {};
11647
- params.type = GGML_TASK_COMPUTE;
11648
  params.ith = 0;
11649
  for (int i = 0; i < cgraph->n_nodes; i++) {
11650
  ggml_tensor * node = cgraph->nodes[i];
@@ -11654,13 +11654,13 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
11654
  }
11655
 
11656
  #ifndef NDEBUG
11657
- assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
11658
  assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
11659
  assert(node->extra != nullptr);
11660
 
11661
  for (int j = 0; j < GGML_MAX_SRC; j++) {
11662
  if (node->src[j] != nullptr) {
11663
- assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
11664
  assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
11665
  assert(node->src[j]->extra != nullptr);
11666
  }
 
6369
  int ixj = col ^ j;
6370
  if (ixj > col) {
6371
  if ((col & k) == 0) {
6372
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
6373
  swap(dst_row[col], dst_row[ixj]);
6374
  }
6375
  } else {
6376
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
6377
  swap(dst_row[col], dst_row[ixj]);
6378
  }
6379
  }
 
7927
 
7928
  const dim3 block_dims(ncols, 1, 1);
7929
  const dim3 block_nums(1, nrows, 1);
7930
+ if (order == GGML_SORT_ORDER_ASC) {
7931
+ k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
7932
+ } else if (order == GGML_SORT_ORDER_DESC) {
7933
+ k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
7934
  } else {
7935
  GGML_ASSERT(false);
7936
  }
 
8362
 
8363
  cudaMemcpyKind kind;
8364
  char * src_ptr;
8365
+ if (src->backend == GGML_BACKEND_TYPE_CPU) {
8366
  kind = cudaMemcpyHostToDevice;
8367
  src_ptr = (char *) src->data;
8368
+ } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
8369
+ GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
8370
  kind = cudaMemcpyDeviceToDevice;
8371
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
8372
  int id;
 
8771
 
8772
  // the main device has a larger memory buffer to hold the results from all GPUs
8773
  // nrows_dst == nrows of the matrix that the kernel writes into
8774
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
8775
 
8776
  switch (src0->type) {
8777
  case GGML_TYPE_Q4_0:
 
8920
 
8921
  // the main device has a larger memory buffer to hold the results from all GPUs
8922
  // nrows_dst == nrows of the matrix that the kernel writes into
8923
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
8924
 
8925
  switch (src0->type) {
8926
  case GGML_TYPE_Q4_0:
 
9096
 
9097
  // the main device has a larger memory buffer to hold the results from all GPUs
9098
  // ldc == nrows of the matrix that cuBLAS writes into
9099
+ int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
9100
 
9101
  const int compute_capability = g_device_caps[id].cc;
9102
 
 
9444
  const bool use_src2 = src2 != nullptr;
9445
 
9446
  if (use_src2) {
9447
+ const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
9448
 
9449
  if (src2_on_device) {
9450
  ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
 
9502
  const bool use_src1 = src1 != nullptr;
9503
  const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
9504
 
9505
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
9506
+ GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
9507
 
9508
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
9509
  ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
9510
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
9511
 
9512
+ const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
9513
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
9514
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
9515
 
9516
  // dd = data device
9517
  float * src0_ddf = nullptr;
 
9555
  CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
9556
  }
9557
 
9558
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
9559
  CUDA_CHECK(cudaDeviceSynchronize());
9560
  }
9561
  }
 
9636
  const int nb2 = dst->nb[2];
9637
  const int nb3 = dst->nb[3];
9638
 
9639
+ GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
9640
+ GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
9641
  GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
9642
 
9643
  GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
 
9653
  ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
9654
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
9655
 
9656
+ const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
9657
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
9658
  const bool src1_is_contiguous = ggml_is_contiguous(src1);
9659
 
9660
  const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
9661
 
9662
+ const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
9663
  GGML_ASSERT(!(split && ne02 > 1));
9664
  GGML_ASSERT(!(split && ne03 > 1));
9665
  GGML_ASSERT(!(split && ne02 < ne12));
9666
 
9667
  std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
9668
  if (split) {
9669
+ // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check
9670
  // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
9671
  ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
9672
  tensor_split = buft_ctx->tensor_split;
 
9724
 
9725
  used_devices++;
9726
 
9727
+ const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
9728
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
9729
 
9730
  ggml_cuda_set_device(id);
9731
  cudaStream_t stream = g_cudaStreams[id][0];
 
9776
  continue;
9777
  }
9778
 
9779
+ const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
9780
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
9781
  const int64_t row_diff = dev[id].row_high - dev[id].row_low;
9782
 
9783
  ggml_cuda_set_device(id);
 
9802
 
9803
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
9804
  // in that case an offset on dst_ddf_i is needed
9805
+ if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) {
9806
  dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
9807
  }
9808
 
9809
  // copy src0, src1 to device if necessary
9810
+ if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
9811
  if (id != g_main_device) {
9812
  if (convert_src1_to_q8_1) {
9813
  char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
 
9820
  src1_ncols*ne10*sizeof(float), stream));
9821
  }
9822
  }
9823
+ } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
9824
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
9825
  src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
9826
  } else {
9827
  GGML_ASSERT(false);
9828
  }
9829
 
9830
+ if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
9831
  quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
9832
  CUDA_CHECK(cudaGetLastError());
9833
  }
 
9845
  if (!dst_on_device) {
9846
  void * dst_off_device;
9847
  cudaMemcpyKind kind;
9848
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
9849
  dst_off_device = dst->data;
9850
  kind = cudaMemcpyDeviceToHost;
9851
+ } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
9852
  dst_off_device = dst_extra->data_device[g_main_device];
9853
  kind = cudaMemcpyDeviceToDevice;
9854
  } else {
 
9913
  }
9914
  }
9915
 
9916
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
9917
  ggml_cuda_set_device(g_main_device);
9918
  CUDA_CHECK(cudaDeviceSynchronize());
9919
  }
 
10019
 
10020
  static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
10021
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
10022
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
10023
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
10024
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
10025
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
 
10050
  GGML_ASSERT(!ggml_is_transposed(src0));
10051
  GGML_ASSERT(!ggml_is_transposed(src1));
10052
  GGML_ASSERT(!ggml_is_permuted(src0));
10053
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
10054
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
10055
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
10056
 
 
10109
  GGML_ASSERT(!ggml_is_transposed(src0));
10110
  GGML_ASSERT(!ggml_is_transposed(src1));
10111
 
10112
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
10113
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
10114
 
10115
  GGML_TENSOR_BINARY_OP_LOCALS
 
10255
 
10256
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10257
  const bool all_on_device =
10258
+ (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
10259
+ (src1->backend == GGML_BACKEND_TYPE_GPU) &&
10260
+ ( dst->backend == GGML_BACKEND_TYPE_GPU);
10261
 
10262
+ const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
10263
 
10264
  int64_t min_compute_capability = INT_MAX;
10265
 
 
10409
  GGML_ASSERT(!ggml_is_transposed(src00));
10410
  GGML_ASSERT(!ggml_is_transposed(src1));
10411
 
10412
+ GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
10413
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
10414
 
10415
  const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
 
10553
 
10554
  cudaStream_t stream = g_cudaStreams[g_main_device][0];
10555
 
10556
+ if (ids->backend == GGML_BACKEND_TYPE_GPU) {
10557
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
10558
  CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
10559
  CUDA_CHECK(cudaStreamSynchronize(stream));
 
10570
  ggml_tensor src1_row = *src1;
10571
  ggml_tensor dst_row = *dst;
10572
 
10573
+ src1_row.backend = GGML_BACKEND_TYPE_GPU;
10574
+ dst_row.backend = GGML_BACKEND_TYPE_GPU;
10575
 
10576
  src1_row.extra = &src1_row_extra;
10577
  dst_row.extra = &dst_row_extra;
10578
 
10579
+ char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
10580
  (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
10581
+ char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
10582
  (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
10583
 
10584
  if (src1->ne[1] == 1) {
10585
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
10586
+ GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
10587
 
10588
  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
10589
  //int32_t row_id;
 
10611
  src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
10612
  dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
10613
 
10614
+ const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ?
10615
  cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
10616
+ const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ?
10617
  cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
10618
 
10619
  for (int32_t row_id = 0; row_id < n_as; ++row_id) {
 
10668
  }
10669
  }
10670
 
10671
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
10672
  CUDA_CHECK(cudaStreamSynchronize(stream));
10673
  }
10674
  }
 
10685
  const int64_t ne = ggml_nelements(src0);
10686
  GGML_ASSERT(ne == ggml_nelements(src1));
10687
 
10688
+ GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
10689
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
10690
 
10691
  GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
10692
  GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
 
10817
  if (!g_cublas_loaded) return false;
10818
 
10819
  ggml_cuda_func_t func;
10820
+ const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
10821
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
10822
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
10823
 
10824
  if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
10825
  return false;
 
10966
  return false;
10967
  }
10968
 
10969
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
10970
  ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
10971
  }
10972
 
10973
  if (params->ith != 0) {
10974
  return true;
10975
  }
10976
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
10977
  return true;
10978
  }
10979
  func(tensor->src[0], tensor->src[1], tensor);
 
11072
 
11073
  extra->data_device[ctx->device] = tensor->data;
11074
 
11075
+ tensor->backend = GGML_BACKEND_TYPE_GPU;
11076
  tensor->extra = extra;
11077
 
11078
  if (ggml_is_quantized(tensor->type)) {
 
11087
  }
11088
 
11089
  GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
11090
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
11091
 
11092
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
11093
 
 
11098
  }
11099
 
11100
  GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
11101
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
11102
 
11103
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
11104
 
 
11333
  CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
11334
  }
11335
  }
11336
+ tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
11337
  tensor->extra = extra;
11338
  }
11339
 
 
11605
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11606
 
11607
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11608
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
11609
 
11610
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
11611
  }
 
11614
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11615
 
11616
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11617
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
11618
 
11619
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
11620
  }
 
11644
  ggml_cuda_set_main_device(cuda_ctx->device);
11645
 
11646
  ggml_compute_params params = {};
11647
+ params.type = GGML_TASK_TYPE_COMPUTE;
11648
  params.ith = 0;
11649
  for (int i = 0; i < cgraph->n_nodes; i++) {
11650
  ggml_tensor * node = cgraph->nodes[i];
 
11654
  }
11655
 
11656
  #ifndef NDEBUG
11657
+ assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
11658
  assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
11659
  assert(node->extra != nullptr);
11660
 
11661
  for (int j = 0; j < GGML_MAX_SRC; j++) {
11662
  if (node->src[j] != nullptr) {
11663
+ assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
11664
  assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
11665
  assert(node->src[j]->extra != nullptr);
11666
  }
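
The task-type rename also touches the CUDA graph-compute path shown above. A minimal sketch of that per-node dispatch with the new enumerators; the wrapper function is hypothetical, while the field names and the ggml_cuda_compute_forward call are taken from the hunks:

    #include "ggml.h"
    #include "ggml-cuda.h" // assumed to declare ggml_cuda_compute_forward

    // sketch of the loop body in ggml_backend_cuda_graph_compute above
    static void compute_node(struct ggml_tensor * node) {
        ggml_compute_params params = {};
        params.type = GGML_TASK_TYPE_COMPUTE; // was GGML_TASK_COMPUTE
        params.ith  = 0;
        ggml_cuda_compute_forward(&params, node);
    }
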
ggml-metal.m CHANGED
@@ -2262,8 +2262,8 @@ static bool ggml_metal_graph_compute(
2262
  id<MTLComputePipelineState> pipeline = nil;
2263
 
2264
  switch (order) {
2265
- case GGML_SORT_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
2266
- case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
2267
  default: GGML_ASSERT(false);
2268
  };
2269
 
 
2262
  id<MTLComputePipelineState> pipeline = nil;
2263
 
2264
  switch (order) {
2265
+ case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
2266
+ case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
2267
  default: GGML_ASSERT(false);
2268
  };
2269
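
For code that builds graphs against ggml.h, only the enumerator passed in changes. A sketch, assuming ggml_argsort keeps its (ctx, tensor, order) signature, which this patch does not modify:

    #include "ggml.h"

    // was: ggml_argsort(ctx, probs, GGML_SORT_DESC)
    static struct ggml_tensor * sort_rows_desc(struct ggml_context * ctx, struct ggml_tensor * probs) {
        return ggml_argsort(ctx, probs, GGML_SORT_ORDER_DESC);
    }
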
 
ggml-opencl.cpp CHANGED
@@ -1354,7 +1354,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
1354
  }
1355
 
1356
  void ggml_cl_free_data(const struct ggml_tensor* tensor) {
1357
- if (tensor->backend != GGML_BACKEND_GPU) {
1358
  return;
1359
  }
1360
 
@@ -1412,7 +1412,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
1412
  }
1413
 
1414
  static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1415
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
1416
  const int64_t ne00 = src0->ne[0];
1417
  const int64_t ne01 = src0->ne[1];
1418
  const int64_t ne02 = src0->ne[2];
@@ -1476,7 +1476,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
1476
  }
1477
 
1478
  static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1479
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
1480
  const int64_t ne00 = src0->ne[0];
1481
  const int64_t ne01 = src0->ne[1];
1482
  const int64_t ne02 = src0->ne[2];
@@ -1566,13 +1566,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
1566
  size_t y_size;
1567
  size_t d_size;
1568
  cl_mem d_X;
1569
- if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
1570
  d_X = (cl_mem) src0->extra;
1571
  } else {
1572
  d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
1573
  }
1574
- cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
1575
- cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
1576
 
1577
  size_t x_offset = 0;
1578
 
@@ -1580,7 +1580,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
1580
  // TODO: copy src0 here when r3>1
1581
  for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
1582
  for (int64_t i02 = 0; i02 < ne02; i02++) {
1583
- if (src0->backend == GGML_BACKEND_GPU) {
1584
  x_offset = (i03 * ne02 + i02) * x_ne;
1585
  } else {
1586
  // copy src0 to device
@@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
1589
 
1590
  for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
1591
  // copy src1 to device
1592
- if (src1->backend == GGML_BACKEND_CPU) {
1593
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
1594
  }
1595
 
@@ -1612,7 +1612,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
1612
  }
1613
 
1614
  // copy dst to host
1615
- if (dst->backend == GGML_BACKEND_CPU) {
1616
  float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
1617
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
1618
  }
@@ -1621,13 +1621,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
1621
  }
1622
  }
1623
 
1624
- if (src0->backend != GGML_BACKEND_GPU) {
1625
  ggml_cl_pool_free(d_X, x_size);
1626
  }
1627
- if (src1->backend != GGML_BACKEND_GPU) {
1628
  ggml_cl_pool_free(d_Y, y_size);
1629
  }
1630
- if (dst->backend != GGML_BACKEND_GPU) {
1631
  ggml_cl_pool_free(d_D, d_size);
1632
  }
1633
  }
@@ -1670,7 +1670,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
1670
  size_t y_size;
1671
  size_t d_size;
1672
  cl_mem d_X;
1673
- if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
1674
  d_X = (cl_mem) src0->extra;
1675
  } else {
1676
  d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
@@ -1687,7 +1687,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
1687
  // TODO: copy src0 here when r3>1
1688
  for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
1689
  for (int64_t i02 = 0; i02 < ne02; i02++) {
1690
- if (src0->backend == GGML_BACKEND_GPU) {
1691
  x_offset = (i03 * ne02 + i02) * x_ne;
1692
  } else {
1693
  // copy src0 to device
@@ -1741,7 +1741,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
1741
  }
1742
 
1743
  // copy dst to host, then convert to float
1744
- if (dst->backend == GGML_BACKEND_CPU) {
1745
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
1746
  float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
1747
  ggml_fp16_to_fp32_row(tmp, d, d_ne);
@@ -1753,7 +1753,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
1753
  }
1754
  }
1755
 
1756
- if (src0->backend != GGML_BACKEND_GPU) {
1757
  ggml_cl_pool_free(d_X, x_size);
1758
  }
1759
  ggml_cl_pool_free(d_Y, y_size);
@@ -1798,7 +1798,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
1798
  cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
1799
  cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
1800
  cl_mem d_Q;
1801
- if (src0->backend == GGML_BACKEND_CPU) {
1802
  d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
1803
  }
1804
 
@@ -1817,10 +1817,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
1817
  for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
1818
  for (int64_t i02 = 0; i02 < ne02; i02++) {
1819
  // copy src0 to device if necessary
1820
- if (src0->backend == GGML_BACKEND_CPU) {
1821
  events.emplace_back();
1822
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
1823
- } else if (src0->backend == GGML_BACKEND_GPU) {
1824
  d_Q = (cl_mem) src0->extra;
1825
  } else {
1826
  GGML_ASSERT(false);
@@ -1829,7 +1829,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
1829
  if (!mul_mat_vec) {
1830
  // convert src0 to fp32 on device
1831
  const size_t global = x_ne / global_denom;
1832
- const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
1833
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
1834
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
1835
  CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
@@ -1843,7 +1843,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
1843
 
1844
  // compute
1845
  const size_t global = ne01 * local;
1846
- const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
1847
  const cl_int ncols = ne00;
1848
  events.emplace_back();
1849
  CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1895,7 +1895,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
1895
  }
1896
  ggml_cl_pool_free(d_Y, y_size);
1897
  ggml_cl_pool_free(d_D, d_size);
1898
- if (src0->backend == GGML_BACKEND_CPU) {
1899
  ggml_cl_pool_free(d_Q, q_size);
1900
  }
1901
  }
@@ -1911,7 +1911,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
1911
  if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
1912
  src1->type == GGML_TYPE_F32 &&
1913
  dst->type == GGML_TYPE_F32 &&
1914
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
1915
  return true;
1916
  }
1917
 
@@ -1993,7 +1993,7 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
1993
  CL_CHECK(clFinish(queue));
1994
 
1995
  tensor->extra = dst;
1996
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
1997
  }
1998
 
1999
  // ggml-backend
@@ -2045,7 +2045,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
2045
  ctx->sub_buffers.push_back(sub_buffer);
2046
  tensor->extra = sub_buffer;
2047
  }
2048
- tensor->backend = GGML_BACKEND_GPU;
2049
  }
2050
 
2051
  static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
 
1354
  }
1355
 
1356
  void ggml_cl_free_data(const struct ggml_tensor* tensor) {
1357
+ if (tensor->backend != GGML_BACKEND_TYPE_GPU) {
1358
  return;
1359
  }
1360
 
 
1412
  }
1413
 
1414
  static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1415
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
1416
  const int64_t ne00 = src0->ne[0];
1417
  const int64_t ne01 = src0->ne[1];
1418
  const int64_t ne02 = src0->ne[2];
 
1476
  }
1477
 
1478
  static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1479
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
1480
  const int64_t ne00 = src0->ne[0];
1481
  const int64_t ne01 = src0->ne[1];
1482
  const int64_t ne02 = src0->ne[2];
 
1566
  size_t y_size;
1567
  size_t d_size;
1568
  cl_mem d_X;
1569
+ if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
1570
  d_X = (cl_mem) src0->extra;
1571
  } else {
1572
  d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
1573
  }
1574
+ cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
1575
+ cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
1576
 
1577
  size_t x_offset = 0;
1578
 
 
1580
  // TODO: copy src0 here when r3>1
1581
  for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
1582
  for (int64_t i02 = 0; i02 < ne02; i02++) {
1583
+ if (src0->backend == GGML_BACKEND_TYPE_GPU) {
1584
  x_offset = (i03 * ne02 + i02) * x_ne;
1585
  } else {
1586
  // copy src0 to device
 
1589
 
1590
  for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
1591
  // copy src1 to device
1592
+ if (src1->backend == GGML_BACKEND_TYPE_CPU) {
1593
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
1594
  }
1595
 
 
1612
  }
1613
 
1614
  // copy dst to host
1615
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
1616
  float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
1617
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
1618
  }
 
1621
  }
1622
  }
1623
 
1624
+ if (src0->backend != GGML_BACKEND_TYPE_GPU) {
1625
  ggml_cl_pool_free(d_X, x_size);
1626
  }
1627
+ if (src1->backend != GGML_BACKEND_TYPE_GPU) {
1628
  ggml_cl_pool_free(d_Y, y_size);
1629
  }
1630
+ if (dst->backend != GGML_BACKEND_TYPE_GPU) {
1631
  ggml_cl_pool_free(d_D, d_size);
1632
  }
1633
  }
 
1670
  size_t y_size;
1671
  size_t d_size;
1672
  cl_mem d_X;
1673
+ if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
1674
  d_X = (cl_mem) src0->extra;
1675
  } else {
1676
  d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
 
1687
  // TODO: copy src0 here when r3>1
1688
  for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
1689
  for (int64_t i02 = 0; i02 < ne02; i02++) {
1690
+ if (src0->backend == GGML_BACKEND_TYPE_GPU) {
1691
  x_offset = (i03 * ne02 + i02) * x_ne;
1692
  } else {
1693
  // copy src0 to device
 
1741
  }
1742
 
1743
  // copy dst to host, then convert to float
1744
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
1745
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
1746
  float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
1747
  ggml_fp16_to_fp32_row(tmp, d, d_ne);
 
1753
  }
1754
  }
1755
 
1756
+ if (src0->backend != GGML_BACKEND_TYPE_GPU) {
1757
  ggml_cl_pool_free(d_X, x_size);
1758
  }
1759
  ggml_cl_pool_free(d_Y, y_size);
 
1798
  cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
1799
  cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
1800
  cl_mem d_Q;
1801
+ if (src0->backend == GGML_BACKEND_TYPE_CPU) {
1802
  d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
1803
  }
1804
 
 
1817
  for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
1818
  for (int64_t i02 = 0; i02 < ne02; i02++) {
1819
  // copy src0 to device if necessary
1820
+ if (src0->backend == GGML_BACKEND_TYPE_CPU) {
1821
  events.emplace_back();
1822
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
1823
+ } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
1824
  d_Q = (cl_mem) src0->extra;
1825
  } else {
1826
  GGML_ASSERT(false);
 
1829
  if (!mul_mat_vec) {
1830
  // convert src0 to fp32 on device
1831
  const size_t global = x_ne / global_denom;
1832
+ const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
1833
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
1834
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
1835
  CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
1843
 
1844
  // compute
1845
  const size_t global = ne01 * local;
1846
+ const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
1847
  const cl_int ncols = ne00;
1848
  events.emplace_back();
1849
  CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
 
1895
  }
1896
  ggml_cl_pool_free(d_Y, y_size);
1897
  ggml_cl_pool_free(d_D, d_size);
1898
+ if (src0->backend == GGML_BACKEND_TYPE_CPU) {
1899
  ggml_cl_pool_free(d_Q, q_size);
1900
  }
1901
  }
 
1911
  if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
1912
  src1->type == GGML_TYPE_F32 &&
1913
  dst->type == GGML_TYPE_F32 &&
1914
+ ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) {
1915
  return true;
1916
  }
1917
 
 
1993
  CL_CHECK(clFinish(queue));
1994
 
1995
  tensor->extra = dst;
1996
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
1997
  }
1998
 
1999
  // ggml-backend
 
2045
  ctx->sub_buffers.push_back(sub_buffer);
2046
  tensor->extra = sub_buffer;
2047
  }
2048
+ tensor->backend = GGML_BACKEND_TYPE_GPU;
2049
  }
2050
 
2051
  static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
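
The OpenCL path above keys its buffer handling off the same backend enumerators, e.g. reusing src0->extra as a cl_mem when the weights already live on the device. An illustrative predicate (hypothetical, not part of the patch) written with the renamed names:

    #include "ggml.h"

    // was: src0->backend == GGML_BACKEND_GPU
    static bool weights_preloaded_on_device(const struct ggml_tensor * src0) {
        return src0->backend == GGML_BACKEND_TYPE_GPU;
    }
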
ggml-sycl.cpp CHANGED
@@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){
3338
 
3339
  size_t total_elements = ggml_nelements(src);
3340
 
3341
- const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT;
3342
  float *src_data =NULL;
3343
  if(src_on_device) {
3344
  ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
8086
  int ixj = col ^ j;
8087
  if (ixj > col) {
8088
  if ((col & k) == 0) {
8089
- if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
8090
  swap(dst_row[col], dst_row[ixj]);
8091
  }
8092
  } else {
8093
- if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
8094
  swap(dst_row[col], dst_row[ixj]);
8095
  }
8096
  }
@@ -10825,7 +10825,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
10825
 
10826
  const sycl::range<3> block_dims(1, 1, ncols);
10827
  const sycl::range<3> block_nums(1, nrows, 1);
10828
- if (order == GGML_SORT_ASC) {
10829
  /*
10830
  DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
10831
  the limit. To get the device limit, query
@@ -10834,9 +10834,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
10834
  stream->parallel_for(
10835
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
10836
  [=](sycl::nd_item<3> item_ct1) {
10837
- k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);
10838
  });
10839
- } else if (order == GGML_SORT_DESC) {
10840
  /*
10841
  DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
10842
  the limit. To get the device limit, query
@@ -10845,7 +10845,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
10845
  stream->parallel_for(
10846
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
10847
  [=](sycl::nd_item<3> item_ct1) {
10848
- k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);
10849
  });
10850
  } else {
10851
  GGML_ASSERT(false);
@@ -11407,12 +11407,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
11407
 
11408
  dpct::memcpy_direction kind;
11409
  char * src_ptr;
11410
- if (src->backend == GGML_BACKEND_CPU) {
11411
  kind = dpct::host_to_device;
11412
  src_ptr = (char *) src->data;
11413
- // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
11414
- } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
11415
- GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
11416
  kind = dpct::device_to_device;
11417
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
11418
  int id;
@@ -11846,7 +11846,7 @@ inline void ggml_sycl_op_mul_mat_q(
11846
 
11847
  // the main device has a larger memory buffer to hold the results from all GPUs
11848
  // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
11849
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
11850
 
11851
  switch (src0->type) {
11852
  case GGML_TYPE_Q4_0:
@@ -12119,7 +12119,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
12119
 
12120
  // the main device has a larger memory buffer to hold the results from all GPUs
12121
  // ldc == nrows of the matrix that cuBLAS writes into
12122
- int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
12123
 
12124
  #ifdef GGML_SYCL_F16
12125
  bool use_fp16 = true; // TODO(Yu) SYCL capability check
@@ -12501,16 +12501,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
12501
  const bool use_src1 = src1 != nullptr;
12502
  const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
12503
 
12504
- GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
12505
- GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
12506
 
12507
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
12508
  ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
12509
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
12510
 
12511
- const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
12512
- const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
12513
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
12514
 
12515
  // dd = data device
12516
  float * src0_ddf = nullptr;
@@ -12565,7 +12565,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
12565
  main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
12566
  }
12567
 
12568
- if (dst->backend == GGML_BACKEND_CPU) {
12569
  SYCL_CHECK(CHECK_TRY_ERROR(
12570
  dpct::get_current_device().queues_wait_and_throw()));
12571
  }
@@ -12640,8 +12640,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12640
  const int nb2 = dst->nb[2];
12641
  const int nb3 = dst->nb[3];
12642
 
12643
- GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
12644
- GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
12645
 
12646
  GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
12647
 
@@ -12656,13 +12656,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12656
  ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
12657
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
12658
 
12659
- const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
12660
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
12661
  const bool src1_is_contiguous = ggml_is_contiguous(src1);
12662
 
12663
  int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
12664
 
12665
- const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
12666
  GGML_ASSERT(!(split && ne02 > 1));
12667
  GGML_ASSERT(!(split && ne03 > 1));
12668
  GGML_ASSERT(!(split && ne02 < ne12));
@@ -12717,8 +12717,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12717
 
12718
  used_devices++;
12719
 
12720
- const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
12721
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
12722
 
12723
  ggml_sycl_set_device(get_device_id_by_index(id));
12724
  const dpct::queue_ptr stream = g_syclStreams[id][0];
@@ -12782,8 +12782,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12782
  continue;
12783
  }
12784
 
12785
- const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
12786
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
12787
  const int64_t row_diff = row_high[id] - row_low[id];
12788
 
12789
  ggml_sycl_set_device(get_device_id_by_index(id));
@@ -12809,12 +12809,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12809
 
12810
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
12811
  // in that case an offset on dst_ddf_i is needed
12812
- if (dst->backend == GGML_BACKEND_GPU && id == g_main_device_index) {
12813
  dst_dd_i += row_low[id]; // offset is 0 if no tensor split
12814
  }
12815
 
12816
  // copy src0, src1 to device if necessary
12817
- if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
12818
  if (id != g_main_device_index) {
12819
  if (convert_src1_to_q8_1) {
12820
  char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;
@@ -12830,14 +12830,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12830
  src1_ncols * ne10 * sizeof(float))));
12831
  }
12832
  }
12833
- } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
12834
  SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
12835
  src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
12836
  } else {
12837
  GGML_ASSERT(false);
12838
  }
12839
 
12840
- if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
12841
  quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
12842
  /*
12843
  DPCT1010:92: SYCL uses exceptions to report errors and does
@@ -12867,10 +12867,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12867
  if (!dst_on_device) {
12868
  void * dst_off_device;
12869
  dpct::memcpy_direction kind;
12870
- if (dst->backend == GGML_BACKEND_CPU) {
12871
  dst_off_device = dst->data;
12872
  kind = dpct::device_to_host;
12873
- } else if (dst->backend == GGML_BACKEND_GPU) {
12874
  dst_off_device = dst_extra->data_device[g_main_device_index];
12875
  kind = dpct::device_to_device;
12876
  } else {
@@ -12954,7 +12954,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12954
  }
12955
  }
12956
 
12957
- if (dst->backend == GGML_BACKEND_CPU) {
12958
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
12959
  SYCL_CHECK(CHECK_TRY_ERROR(
12960
  dpct::get_current_device().queues_wait_and_throw()));
@@ -13091,7 +13091,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
13091
  const ggml_tensor *src1,
13092
  ggml_tensor *dst) try {
13093
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
13094
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
13095
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
13096
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
13097
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -13129,7 +13129,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
13129
  GGML_ASSERT(!ggml_is_transposed(src0));
13130
  GGML_ASSERT(!ggml_is_transposed(src1));
13131
  GGML_ASSERT(!ggml_is_permuted(src0));
13132
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
13133
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13134
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
13135
 
@@ -13196,7 +13196,7 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
13196
  GGML_ASSERT(!ggml_is_transposed(src0));
13197
  GGML_ASSERT(!ggml_is_transposed(src1));
13198
 
13199
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
13200
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13201
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
13202
 
@@ -13372,11 +13372,11 @@ catch (sycl::exception const &exc) {
13372
 
13373
  static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
13374
  const bool all_on_device =
13375
- (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
13376
- (src1->backend == GGML_BACKEND_GPU) &&
13377
- ( dst->backend == GGML_BACKEND_GPU);
13378
 
13379
- const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
13380
 
13381
  int64_t min_compute_capability = INT_MAX;
13382
  for (int64_t id = 0; id < g_device_count; ++id) {
@@ -13505,7 +13505,7 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
13505
  GGML_ASSERT(!ggml_is_transposed(src00));
13506
  GGML_ASSERT(!ggml_is_transposed(src1));
13507
 
13508
- GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
13509
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
13510
 
13511
  GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
@@ -13643,7 +13643,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
13643
 
13644
  const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
13645
 
13646
- if (ids->backend == GGML_BACKEND_GPU) {
13647
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
13648
  SYCL_CHECK(CHECK_TRY_ERROR(
13649
  stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
@@ -13661,20 +13661,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
13661
  ggml_tensor src1_row = *src1;
13662
  ggml_tensor dst_row = *dst;
13663
 
13664
- src1_row.backend = GGML_BACKEND_GPU;
13665
- dst_row.backend = GGML_BACKEND_GPU;
13666
 
13667
  src1_row.extra = &src1_row_extra;
13668
  dst_row.extra = &dst_row_extra;
13669
 
13670
- char * src1_original = src1->backend == GGML_BACKEND_CPU ?
13671
  (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
13672
- char * dst_original = dst->backend == GGML_BACKEND_CPU ?
13673
  (char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];
13674
 
13675
  if (src1->ne[1] == 1) {
13676
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
13677
- GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
13678
 
13679
  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
13680
  //int32_t row_id;
@@ -13756,7 +13756,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
13756
  }
13757
  }
13758
 
13759
- if (dst->backend == GGML_BACKEND_CPU) {
13760
  SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
13761
  }
13762
  }
@@ -13779,8 +13779,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
13779
  const int64_t ne = ggml_nelements(src0);
13780
  GGML_ASSERT(ne == ggml_nelements(src1));
13781
 
13782
- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
13783
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
13784
 
13785
  GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
13786
  GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
@@ -13887,17 +13887,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
13887
  memset(extra, 0, sizeof(*extra));
13888
 
13889
  for (int64_t id = 0; id < g_device_count; ++id) {
13890
- if (backend == GGML_BACKEND_GPU && id != g_main_device_index) {
13891
  continue;
13892
  }
13893
  ggml_sycl_set_device(get_device_id_by_index(id));
13894
  const dpct::queue_ptr stream = g_syclStreams[id][0];
13895
 
13896
  int64_t row_low, row_high;
13897
- if (backend == GGML_BACKEND_GPU) {
13898
  row_low = 0;
13899
  row_high = nrows;
13900
- } else if (backend == GGML_BACKEND_GPU_SPLIT) {
13901
  const int64_t rounding = get_row_rounding(tensor->type);
13902
 
13903
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
@@ -13946,7 +13946,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
13946
 
13947
  extra->data_device[id] = buf;
13948
 
13949
- if (backend == GGML_BACKEND_GPU_SPLIT) {
13950
  for (int64_t is = 0; is < MAX_STREAMS; ++is) {
13951
  SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
13952
  new sycl::event()));
@@ -13963,7 +13963,7 @@ catch (sycl::exception const &exc) {
13963
  }
13964
 
13965
  void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
13966
- if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
13967
  return;
13968
  }
13969
 
@@ -14016,15 +14016,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
14016
  return;
14017
  }
14018
 
14019
- tensor->backend = GGML_BACKEND_GPU;
14020
 
14021
- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
14022
  const ggml_op src0_op = tensor->src[0]->op;
14023
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
14024
  ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
14025
  }
14026
  }
14027
- if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
14028
  ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
14029
  }
14030
 
@@ -14042,7 +14042,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
14042
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
14043
  const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
14044
 
14045
- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
14046
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
14047
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
14048
  size_t offset = 0;
@@ -14111,7 +14111,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,
14111
 
14112
  const bool inplace = tensor->view_src != nullptr;
14113
 
14114
- if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
14115
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
14116
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
14117
  size_t view_offset = 0;
@@ -14132,7 +14132,7 @@ catch (sycl::exception const &exc) {
14132
  }
14133
 
14134
  void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
14135
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14136
  GGML_ASSERT(ggml_is_contiguous(tensor));
14137
 
14138
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -14219,9 +14219,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
14219
  if (!g_sycl_loaded) return false;
14220
 
14221
  ggml_sycl_func_t func;
14222
- const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
14223
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
14224
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
14225
 
14226
  if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
14227
  return false;
@@ -14359,14 +14359,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
14359
  return false;
14360
  }
14361
 
14362
- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
14363
  ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
14364
  }
14365
 
14366
  if (params->ith != 0) {
14367
  return true;
14368
  }
14369
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14370
  return true;
14371
  }
14372
  func(tensor->src[0], tensor->src[1], tensor);
@@ -14517,7 +14517,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
14517
 
14518
  extra->data_device[ctx->device] = tensor->data;
14519
 
14520
- tensor->backend = GGML_BACKEND_GPU;
14521
  tensor->extra = extra;
14522
 
14523
  if (ggml_is_quantized(tensor->type)) {
@@ -14548,7 +14548,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14548
  ggml_tensor *tensor,
14549
  const void *data, size_t offset,
14550
  size_t size) try {
14551
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14552
 
14553
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
14554
 
@@ -14573,7 +14573,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
14573
  const ggml_tensor *tensor,
14574
  void *data, size_t offset,
14575
  size_t size) try {
14576
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14577
 
14578
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
14579
 
@@ -14809,7 +14809,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
14809
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14810
 
14811
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
14812
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14813
 
14814
  SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
14815
  (char *)tensor->data + offset, data, size)));
@@ -14827,7 +14827,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
14827
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14828
 
14829
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
14830
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14831
 
14832
  SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
14833
  data, (const char *)tensor->data + offset, size)));
@@ -14880,7 +14880,7 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
14880
  ggml_sycl_set_main_device(sycl_ctx->device);
14881
 
14882
  ggml_compute_params params = {};
14883
- params.type = GGML_TASK_COMPUTE;
14884
  params.ith = 0;
14885
  for (int i = 0; i < cgraph->n_nodes; i++) {
14886
  ggml_tensor * node = cgraph->nodes[i];
@@ -14888,13 +14888,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
14888
  if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
14889
  continue;
14890
 
14891
- assert(node->backend == GGML_BACKEND_GPU);
14892
  assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
14893
  assert(node->extra != nullptr);
14894
 
14895
  for (int j = 0; j < GGML_MAX_SRC; j++) {
14896
  if (node->src[j] != nullptr) {
14897
- assert(node->src[j]->backend == GGML_BACKEND_GPU);
14898
  assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
14899
  assert(node->src[j]->extra != nullptr);
14900
  }
 
3338
 
3339
  size_t total_elements = ggml_nelements(src);
3340
 
3341
+ const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
3342
  float *src_data =NULL;
3343
  if(src_on_device) {
3344
  ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
 
8086
  int ixj = col ^ j;
8087
  if (ixj > col) {
8088
  if ((col & k) == 0) {
8089
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
8090
  swap(dst_row[col], dst_row[ixj]);
8091
  }
8092
  } else {
8093
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
8094
  swap(dst_row[col], dst_row[ixj]);
8095
  }
8096
  }
 
10825
 
10826
  const sycl::range<3> block_dims(1, 1, ncols);
10827
  const sycl::range<3> block_nums(1, nrows, 1);
10828
+ if (order == GGML_SORT_ORDER_ASC) {
10829
  /*
10830
  DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
10831
  the limit. To get the device limit, query
 
10834
  stream->parallel_for(
10835
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
10836
  [=](sycl::nd_item<3> item_ct1) {
10837
+ k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
10838
  });
10839
+ } else if (order == GGML_SORT_ORDER_DESC) {
10840
  /*
10841
  DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
10842
  the limit. To get the device limit, query
 
10845
  stream->parallel_for(
10846
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
10847
  [=](sycl::nd_item<3> item_ct1) {
10848
+ k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
10849
  });
10850
  } else {
10851
  GGML_ASSERT(false);
 
11407
 
11408
  dpct::memcpy_direction kind;
11409
  char * src_ptr;
11410
+ if (src->backend == GGML_BACKEND_TYPE_CPU) {
11411
  kind = dpct::host_to_device;
11412
  src_ptr = (char *) src->data;
11413
+ // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
11414
+ } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
11415
+ GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
11416
  kind = dpct::device_to_device;
11417
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
11418
  int id;
 
11846
 
11847
  // the main device has a larger memory buffer to hold the results from all GPUs
11848
  // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
11849
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
11850
 
11851
  switch (src0->type) {
11852
  case GGML_TYPE_Q4_0:
 
12119
 
12120
  // the main device has a larger memory buffer to hold the results from all GPUs
12121
  // ldc == nrows of the matrix that cuBLAS writes into
12122
+ int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
12123
 
12124
  #ifdef GGML_SYCL_F16
12125
  bool use_fp16 = true; // TODO(Yu) SYCL capability check
 
12501
  const bool use_src1 = src1 != nullptr;
12502
  const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
12503
 
12504
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
12505
+ GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
12506
 
12507
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
12508
  ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
12509
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
12510
 
12511
+ const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
12512
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
12513
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
12514
 
12515
  // dd = data device
12516
  float * src0_ddf = nullptr;
 
12565
  main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
12566
  }
12567
 
12568
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
12569
  SYCL_CHECK(CHECK_TRY_ERROR(
12570
  dpct::get_current_device().queues_wait_and_throw()));
12571
  }
 
12640
  const int nb2 = dst->nb[2];
12641
  const int nb3 = dst->nb[3];
12642
 
12643
+ GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
12644
+ GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
12645
 
12646
  GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
12647
 
 
12656
  ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
12657
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
12658
 
12659
+ const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
12660
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
12661
  const bool src1_is_contiguous = ggml_is_contiguous(src1);
12662
 
12663
  int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
12664
 
12665
+ const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
12666
  GGML_ASSERT(!(split && ne02 > 1));
12667
  GGML_ASSERT(!(split && ne03 > 1));
12668
  GGML_ASSERT(!(split && ne02 < ne12));
 
12717
 
12718
  used_devices++;
12719
 
12720
+ const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
12721
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
12722
 
12723
  ggml_sycl_set_device(get_device_id_by_index(id));
12724
  const dpct::queue_ptr stream = g_syclStreams[id][0];
 
12782
  continue;
12783
  }
12784
 
12785
+ const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
12786
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
12787
  const int64_t row_diff = row_high[id] - row_low[id];
12788
 
12789
  ggml_sycl_set_device(get_device_id_by_index(id));
 
12809
 
12810
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
12811
  // in that case an offset on dst_ddf_i is needed
12812
+ if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) {
12813
  dst_dd_i += row_low[id]; // offset is 0 if no tensor split
12814
  }
12815
 
12816
  // copy src0, src1 to device if necessary
12817
+ if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
12818
  if (id != g_main_device_index) {
12819
  if (convert_src1_to_q8_1) {
12820
  char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;
 
12830
  src1_ncols * ne10 * sizeof(float))));
12831
  }
12832
  }
12833
+ } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
12834
  SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
12835
  src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
12836
  } else {
12837
  GGML_ASSERT(false);
12838
  }
12839
 
12840
+ if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
12841
  quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
12842
  /*
12843
  DPCT1010:92: SYCL uses exceptions to report errors and does
 
12867
  if (!dst_on_device) {
12868
  void * dst_off_device;
12869
  dpct::memcpy_direction kind;
12870
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
12871
  dst_off_device = dst->data;
12872
  kind = dpct::device_to_host;
12873
+ } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
12874
  dst_off_device = dst_extra->data_device[g_main_device_index];
12875
  kind = dpct::device_to_device;
12876
  } else {
 
12954
  }
12955
  }
12956
 
12957
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
12958
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
12959
  SYCL_CHECK(CHECK_TRY_ERROR(
12960
  dpct::get_current_device().queues_wait_and_throw()));
 
13091
  const ggml_tensor *src1,
13092
  ggml_tensor *dst) try {
13093
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
13094
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
13095
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
13096
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
13097
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
 
13129
  GGML_ASSERT(!ggml_is_transposed(src0));
13130
  GGML_ASSERT(!ggml_is_transposed(src1));
13131
  GGML_ASSERT(!ggml_is_permuted(src0));
13132
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
13133
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13134
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
13135
 
 
13196
  GGML_ASSERT(!ggml_is_transposed(src0));
13197
  GGML_ASSERT(!ggml_is_transposed(src1));
13198
 
13199
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
13200
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13201
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
13202
 
 
13372
 
13373
  static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
13374
  const bool all_on_device =
13375
+ (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
13376
+ (src1->backend == GGML_BACKEND_TYPE_GPU) &&
13377
+ ( dst->backend == GGML_BACKEND_TYPE_GPU);
13378
 
13379
+ const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
13380
 
13381
  int64_t min_compute_capability = INT_MAX;
13382
  for (int64_t id = 0; id < g_device_count; ++id) {
 
13505
  GGML_ASSERT(!ggml_is_transposed(src00));
13506
  GGML_ASSERT(!ggml_is_transposed(src1));
13507
 
13508
+ GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
13509
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
13510
 
13511
  GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
 
13643
 
13644
  const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
13645
 
13646
+ if (ids->backend == GGML_BACKEND_TYPE_GPU) {
13647
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
13648
  SYCL_CHECK(CHECK_TRY_ERROR(
13649
  stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
 
13661
  ggml_tensor src1_row = *src1;
13662
  ggml_tensor dst_row = *dst;
13663
 
13664
+ src1_row.backend = GGML_BACKEND_TYPE_GPU;
13665
+ dst_row.backend = GGML_BACKEND_TYPE_GPU;
13666
 
13667
  src1_row.extra = &src1_row_extra;
13668
  dst_row.extra = &dst_row_extra;
13669
 
13670
+ char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
13671
  (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
13672
+ char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
13673
  (char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];
13674
 
13675
  if (src1->ne[1] == 1) {
13676
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
13677
+ GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
13678
 
13679
  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
13680
  //int32_t row_id;
 
13756
  }
13757
  }
13758
 
13759
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
13760
  SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
13761
  }
13762
  }
 
13779
  const int64_t ne = ggml_nelements(src0);
13780
  GGML_ASSERT(ne == ggml_nelements(src1));
13781
 
13782
+ GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
13783
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
13784
 
13785
  GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
13786
  GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
 
13887
  memset(extra, 0, sizeof(*extra));
13888
 
13889
  for (int64_t id = 0; id < g_device_count; ++id) {
13890
+ if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) {
13891
  continue;
13892
  }
13893
  ggml_sycl_set_device(get_device_id_by_index(id));
13894
  const dpct::queue_ptr stream = g_syclStreams[id][0];
13895
 
13896
  int64_t row_low, row_high;
13897
+ if (backend == GGML_BACKEND_TYPE_GPU) {
13898
  row_low = 0;
13899
  row_high = nrows;
13900
+ } else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
13901
  const int64_t rounding = get_row_rounding(tensor->type);
13902
 
13903
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
 
13946
 
13947
  extra->data_device[id] = buf;
13948
 
13949
+ if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
13950
  for (int64_t is = 0; is < MAX_STREAMS; ++is) {
13951
  SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
13952
  new sycl::event()));
 
13963
  }
13964
 
13965
  void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
13966
+ if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) {
13967
  return;
13968
  }
13969
 
 
14016
  return;
14017
  }
14018
 
14019
+ tensor->backend = GGML_BACKEND_TYPE_GPU;
14020
 
14021
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) {
14022
  const ggml_op src0_op = tensor->src[0]->op;
14023
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
14024
  ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
14025
  }
14026
  }
14027
+ if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) {
14028
  ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
14029
  }
14030
 
 
14042
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
14043
  const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
14044
 
14045
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
14046
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
14047
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
14048
  size_t offset = 0;
 
14111
 
14112
  const bool inplace = tensor->view_src != nullptr;
14113
 
14114
+ if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
14115
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
14116
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
14117
  size_t view_offset = 0;
 
14132
  }
14133
 
14134
  void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
14135
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14136
  GGML_ASSERT(ggml_is_contiguous(tensor));
14137
 
14138
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
14219
  if (!g_sycl_loaded) return false;
14220
 
14221
  ggml_sycl_func_t func;
14222
+ const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
14223
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
14224
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
14225
 
14226
  if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
14227
  return false;
 
14359
  return false;
14360
  }
14361
 
14362
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
14363
  ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
14364
  }
14365
 
14366
  if (params->ith != 0) {
14367
  return true;
14368
  }
14369
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14370
  return true;
14371
  }
14372
  func(tensor->src[0], tensor->src[1], tensor);
 
14517
 
14518
  extra->data_device[ctx->device] = tensor->data;
14519
 
14520
+ tensor->backend = GGML_BACKEND_TYPE_GPU;
14521
  tensor->extra = extra;
14522
 
14523
  if (ggml_is_quantized(tensor->type)) {
 
14548
  ggml_tensor *tensor,
14549
  const void *data, size_t offset,
14550
  size_t size) try {
14551
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14552
 
14553
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
14554
 
 
14573
  const ggml_tensor *tensor,
14574
  void *data, size_t offset,
14575
  size_t size) try {
14576
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14577
 
14578
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
14579
 
 
14809
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14810
 
14811
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
14812
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14813
 
14814
  SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
14815
  (char *)tensor->data + offset, data, size)));
 
14827
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14828
 
14829
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
14830
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14831
 
14832
  SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
14833
  data, (const char *)tensor->data + offset, size)));
 
14880
  ggml_sycl_set_main_device(sycl_ctx->device);
14881
 
14882
  ggml_compute_params params = {};
14883
+ params.type = GGML_TASK_TYPE_COMPUTE;
14884
  params.ith = 0;
14885
  for (int i = 0; i < cgraph->n_nodes; i++) {
14886
  ggml_tensor * node = cgraph->nodes[i];
 
14888
  if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
14889
  continue;
14890
 
14891
+ assert(node->backend == GGML_BACKEND_TYPE_GPU);
14892
  assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
14893
  assert(node->extra != nullptr);
14894
 
14895
  for (int j = 0; j < GGML_MAX_SRC; j++) {
14896
  if (node->src[j] != nullptr) {
14897
+ assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU);
14898
  assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
14899
  assert(node->src[j]->extra != nullptr);
14900
  }
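
The ggml-sycl.cpp changes above are a mechanical rename with no behavioral change: each enum value is renamed to carry its enum's name, i.e. GGML_BACKEND_CPU/GPU/GPU_SPLIT become GGML_BACKEND_TYPE_CPU/GPU/GPU_SPLIT, GGML_SORT_ASC/DESC become GGML_SORT_ORDER_ASC/DESC, and GGML_TASK_INIT/COMPUTE/FINALIZE become GGML_TASK_TYPE_INIT/COMPUTE/FINALIZE. As a minimal sketch only (the helper name below is hypothetical and not part of this commit), a backend check written against the new names looks like this:

    #include "ggml.h"  // provides ggml_tensor and the GGML_BACKEND_TYPE_* enum values

    // hypothetical helper, shown only to illustrate the renamed enum values;
    // the commit itself just substitutes the names in place throughout the file
    static bool tensor_on_device(const struct ggml_tensor * t) {
        // old names: GGML_BACKEND_GPU, GGML_BACKEND_GPU_SPLIT
        return t->backend == GGML_BACKEND_TYPE_GPU ||
               t->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
    }

The same substitution pattern is applied to ggml-vulkan.cpp below.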
ggml-vulkan.cpp CHANGED
@@ -2320,8 +2320,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2320
  src1_uma = d_Qy != nullptr;
2321
  }
2322
 
2323
- const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
2324
- const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
2325
 
2326
  const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
2327
  const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@@ -2453,7 +2453,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2453
  // compute
2454
  ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT
2455
 
2456
- if (dst->backend == GGML_BACKEND_CPU) {
2457
  // copy dst to host
2458
  float * d = (float *) ((char *) dst->data);
2459
  ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
@@ -2506,8 +2506,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
2506
  src1_uma = d_Qy != nullptr;
2507
  }
2508
 
2509
- const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
2510
- const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
2511
 
2512
  const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
2513
  const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@@ -2630,7 +2630,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
2630
  ggml_vk_sync_buffers(subctx);
2631
  ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
2632
 
2633
- if (dst->backend == GGML_BACKEND_CPU) {
2634
  // copy dst to host
2635
  float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
2636
  ggml_vk_sync_buffers(subctx);
@@ -2647,7 +2647,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
2647
  std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
2648
  #endif
2649
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
2650
- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
2651
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
2652
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
2653
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -2679,7 +2679,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
2679
  src1_uma = d_Qy != nullptr;
2680
  }
2681
 
2682
- const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
2683
 
2684
  const uint64_t x_ne = ne00 * ne01 * ne02;
2685
  const uint64_t y_ne = ne10 * ne11 * ne12;
@@ -2721,7 +2721,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
2721
  ggml_vk_sync_buffers(subctx);
2722
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
2723
 
2724
- if (dst->backend == GGML_BACKEND_CPU) {
2725
  // copy dst to host
2726
  float * d = (float *) dst->data;
2727
  ggml_vk_sync_buffers(subctx);
@@ -2738,7 +2738,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
2738
  GGML_ASSERT(!ggml_is_transposed(src0));
2739
  GGML_ASSERT(!ggml_is_transposed(src1));
2740
  GGML_ASSERT(!ggml_is_permuted(src0));
2741
- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
2742
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
2743
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
2744
 
@@ -2771,7 +2771,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
2771
  src1_uma = d_Qy != nullptr;
2772
  }
2773
 
2774
- const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
2775
 
2776
  const uint64_t d_ne = ne01 * ne11 * ne12;
2777
 
@@ -2814,7 +2814,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
2814
  ggml_vk_sync_buffers(subctx);
2815
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
2816
 
2817
- if (dst->backend == GGML_BACKEND_CPU) {
2818
  // copy dst to host
2819
  float * d = (float *) dst->data;
2820
  ggml_vk_sync_buffers(subctx);
@@ -2832,7 +2832,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr
2832
  return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
2833
  (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
2834
  dst->type == GGML_TYPE_F32 &&
2835
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU);
2836
  }
2837
 
2838
  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
@@ -2880,8 +2880,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
2880
  // TODO: support for transposed / permuted tensors
2881
  GGML_ASSERT(nb0 == sizeof(float));
2882
  GGML_ASSERT(nb00 == sizeof(float));
2883
- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
2884
- GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
2885
 
2886
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
2887
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -3110,8 +3110,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3110
  }
3111
  }
3112
 
3113
- const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma;
3114
- const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma;
3115
 
3116
  uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
3117
  uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0;
@@ -3120,7 +3120,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3120
  vk_buffer d_D = extra->buffer_gpu.lock();
3121
 
3122
  // Workaround for tiny tensor inputs on ROPE
3123
- if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) {
3124
  y_sz = VK_WHOLE_SIZE;
3125
  }
3126
 
@@ -3209,9 +3209,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3209
  ggml_vk_sync_buffers(subctx);
3210
  ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
3211
  }
3212
- if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) {
3213
  ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
3214
- } else if(dst->backend == GGML_BACKEND_CPU) {
3215
  // copy dst to host
3216
  float * d = (float *) dst->data;
3217
  ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
@@ -3253,7 +3253,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3253
  ggml_vk_sync_buffers(subctx);
3254
  ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
3255
  }
3256
- if (dst->backend == GGML_BACKEND_CPU) {
3257
  // copy dst to host
3258
  ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
3259
  }
@@ -3359,7 +3359,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
3359
 
3360
  static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
3361
  // If backend is CPU, data from src0 has to be copied off the device
3362
- if (dst->backend == GGML_BACKEND_CPU) {
3363
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3364
  vk_buffer d_D = extra_src0->buffer_gpu.lock();
3365
  ggml_vk_sync_buffers(subctx);
@@ -3994,9 +3994,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
3994
  #ifdef GGML_VULKAN_DEBUG
3995
  std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
3996
  #endif
3997
- const bool any_on_device = node->backend == GGML_BACKEND_GPU
3998
- || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
3999
- || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU));
4000
 
4001
  if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) {
4002
  return;
@@ -4215,9 +4215,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
4215
  }
4216
 
4217
  static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
4218
- const bool any_on_device = node->backend == GGML_BACKEND_GPU
4219
- || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
4220
- || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU);
4221
 
4222
  if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
4223
  return;
@@ -4371,7 +4371,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
4371
  last_node = true;
4372
  #endif
4373
 
4374
- if (node->backend == GGML_BACKEND_CPU || last_node) {
4375
  ggml_vk_ctx_end(ctx->compute_ctx);
4376
  ctx->compute_ctx->exit_tensor = node;
4377
  ctx->compute_ctx = nullptr;
@@ -4379,9 +4379,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
4379
  }
4380
 
4381
  static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
4382
- const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
4383
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
4384
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
4385
 
4386
  if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) {
4387
  return false;
@@ -4442,7 +4442,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
4442
  if (params->ith != 0) {
4443
  return true;
4444
  }
4445
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
4446
  return true;
4447
  }
4448
 
@@ -4745,7 +4745,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
4745
  extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
4746
  }
4747
 
4748
- tensor->backend = GGML_BACKEND_GPU;
4749
  tensor->extra = extra;
4750
  }
4751
 
@@ -4753,7 +4753,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
4753
  #ifdef GGML_VULKAN_DEBUG
4754
  std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
4755
  #endif
4756
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
4757
 
4758
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
4759
 
@@ -4768,7 +4768,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
4768
  #ifdef GGML_VULKAN_DEBUG
4769
  std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
4770
  #endif
4771
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
4772
 
4773
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
4774
 
@@ -4999,7 +4999,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
4999
  #endif
5000
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
5001
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
5002
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
5003
 
5004
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
5005
 
@@ -5020,7 +5020,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
5020
  #endif
5021
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
5022
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
5023
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
5024
 
5025
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
5026
 
@@ -5097,7 +5097,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
5097
  int last_node = cgraph->n_nodes - 1;
5098
 
5099
  // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
5100
- while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
5101
  last_node -= 1;
5102
  }
5103
 
@@ -5106,7 +5106,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
5106
  }
5107
 
5108
  ggml_compute_params params = {};
5109
- params.type = GGML_TASK_COMPUTE;
5110
  params.ith = 0;
5111
  for (int i = 0; i < cgraph->n_nodes; i++) {
5112
  ggml_tensor * node = cgraph->nodes[i];
@@ -5416,7 +5416,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
5416
  static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
5417
  void * tensor_data = tensor->data;
5418
 
5419
- if (tensor->backend == GGML_BACKEND_GPU) {
5420
  const size_t tensor_size = ggml_nbytes(tensor);
5421
  tensor_data = malloc(tensor_size);
5422
 
@@ -5442,14 +5442,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
5442
  std::vector<const ggml_tensor *> done;
5443
  ggml_vk_print_graph_origin(tensor, done);
5444
 
5445
- if (tensor->backend == GGML_BACKEND_GPU) {
5446
  free(tensor_data);
5447
  }
5448
  }
5449
 
5450
  static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
5451
  return;
5452
- GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU);
5453
  if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
5454
  return;
5455
  }
@@ -5487,7 +5487,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
5487
  if (params->ith != 0) {
5488
  return;
5489
  }
5490
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
5491
  return;
5492
  }
5493
 
@@ -5524,10 +5524,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
5524
 
5525
  src0_buffer = malloc(src0_size);
5526
  src0_clone->data = src0_buffer;
5527
- if (src0->backend == GGML_BACKEND_CPU) {
5528
  memcpy(src0_clone->data, src0->data, src0_size);
5529
  memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
5530
- } else if (src0->backend == GGML_BACKEND_GPU) {
5531
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
5532
  uint64_t offset = extra->offset;
5533
  if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
@@ -5567,10 +5567,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
5567
 
5568
  src1_buffer = malloc(src1_size);
5569
  src1_clone->data = src1_buffer;
5570
- if (src1->backend == GGML_BACKEND_CPU) {
5571
  memcpy(src1_clone->data, src1->data, src1_size);
5572
  memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
5573
- } else if (src1->backend == GGML_BACKEND_GPU) {
5574
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
5575
  uint64_t offset = extra->offset;
5576
  if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
@@ -5729,7 +5729,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
5729
  if (params->ith != 0) {
5730
  return;
5731
  }
5732
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
5733
  return;
5734
  }
5735
  if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
@@ -5741,7 +5741,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
5741
 
5742
  void * tensor_data = tensor->data;
5743
 
5744
- if (tensor->backend == GGML_BACKEND_GPU) {
5745
  size_t tensor_size = ggml_nbytes(tensor);
5746
  tensor_data = malloc(tensor_size);
5747
 
@@ -5874,7 +5874,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
5874
  comp_result = nullptr;
5875
  comp_size = 0;
5876
 
5877
- if (tensor->backend == GGML_BACKEND_GPU) {
5878
  free(tensor_data);
5879
  }
5880
  }
 
2320
  src1_uma = d_Qy != nullptr;
2321
  }
2322
 
2323
+ const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
2324
+ const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
2325
 
2326
  const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
2327
  const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
 
2453
  // compute
2454
  ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT
2455
 
2456
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
2457
  // copy dst to host
2458
  float * d = (float *) ((char *) dst->data);
2459
  ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
 
2506
  src1_uma = d_Qy != nullptr;
2507
  }
2508
 
2509
+ const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
2510
+ const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
2511
 
2512
  const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
2513
  const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
 
2630
  ggml_vk_sync_buffers(subctx);
2631
  ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
2632
 
2633
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
2634
  // copy dst to host
2635
  float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
2636
  ggml_vk_sync_buffers(subctx);
 
2647
  std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
2648
  #endif
2649
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
2650
+ GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
2651
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
2652
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
2653
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
 
2679
  src1_uma = d_Qy != nullptr;
2680
  }
2681
 
2682
+ const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
2683
 
2684
  const uint64_t x_ne = ne00 * ne01 * ne02;
2685
  const uint64_t y_ne = ne10 * ne11 * ne12;
 
2721
  ggml_vk_sync_buffers(subctx);
2722
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
2723
 
2724
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
2725
  // copy dst to host
2726
  float * d = (float *) dst->data;
2727
  ggml_vk_sync_buffers(subctx);
 
2738
  GGML_ASSERT(!ggml_is_transposed(src0));
2739
  GGML_ASSERT(!ggml_is_transposed(src1));
2740
  GGML_ASSERT(!ggml_is_permuted(src0));
2741
+ GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
2742
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
2743
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
2744
 
 
2771
  src1_uma = d_Qy != nullptr;
2772
  }
2773
 
2774
+ const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
2775
 
2776
  const uint64_t d_ne = ne01 * ne11 * ne12;
2777
 
 
2814
  ggml_vk_sync_buffers(subctx);
2815
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
2816
 
2817
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
2818
  // copy dst to host
2819
  float * d = (float *) dst->data;
2820
  ggml_vk_sync_buffers(subctx);
 
2832
  return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
2833
  (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
2834
  dst->type == GGML_TYPE_F32 &&
2835
+ ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
2836
  }
2837
 
2838
  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
 
2880
  // TODO: support for transposed / permuted tensors
2881
  GGML_ASSERT(nb0 == sizeof(float));
2882
  GGML_ASSERT(nb00 == sizeof(float));
2883
+ GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
2884
+ GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
2885
 
2886
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
2887
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
 
3110
  }
3111
  }
3112
 
3113
+ const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
3114
+ const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
3115
 
3116
  uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
3117
  uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0;
 
3120
  vk_buffer d_D = extra->buffer_gpu.lock();
3121
 
3122
  // Workaround for tiny tensor inputs on ROPE
3123
+ if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
3124
  y_sz = VK_WHOLE_SIZE;
3125
  }
3126
 
 
3209
  ggml_vk_sync_buffers(subctx);
3210
  ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
3211
  }
3212
+ if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
3213
  ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
3214
+ } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
3215
  // copy dst to host
3216
  float * d = (float *) dst->data;
3217
  ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
 
3253
  ggml_vk_sync_buffers(subctx);
3254
  ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
3255
  }
3256
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
3257
  // copy dst to host
3258
  ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
3259
  }
 
3359
 
3360
  static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
3361
  // If backend is CPU, data from src0 has to be copied off the device
3362
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
3363
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3364
  vk_buffer d_D = extra_src0->buffer_gpu.lock();
3365
  ggml_vk_sync_buffers(subctx);
 
3994
  #ifdef GGML_VULKAN_DEBUG
3995
  std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
3996
  #endif
3997
+ const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
3998
+ || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
3999
+ || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
4000
 
4001
  if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) {
4002
  return;
 
4215
  }
4216
 
4217
  static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
4218
+ const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
4219
+ || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
4220
+ || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
4221
 
4222
  if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
4223
  return;
 
4371
  last_node = true;
4372
  #endif
4373
 
4374
+ if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
4375
  ggml_vk_ctx_end(ctx->compute_ctx);
4376
  ctx->compute_ctx->exit_tensor = node;
4377
  ctx->compute_ctx = nullptr;
 
4379
  }
4380
 
4381
  static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
4382
+ const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
4383
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
4384
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
4385
 
4386
  if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) {
4387
  return false;
 
4442
  if (params->ith != 0) {
4443
  return true;
4444
  }
4445
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
4446
  return true;
4447
  }
4448
 
 
4745
  extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
4746
  }
4747
 
4748
+ tensor->backend = GGML_BACKEND_TYPE_GPU;
4749
  tensor->extra = extra;
4750
  }
4751
 
 
4753
  #ifdef GGML_VULKAN_DEBUG
4754
  std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
4755
  #endif
4756
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
4757
 
4758
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
4759
 
 
4768
  #ifdef GGML_VULKAN_DEBUG
4769
  std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
4770
  #endif
4771
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
4772
 
4773
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
4774
 
 
4999
  #endif
5000
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
5001
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
5002
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
5003
 
5004
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
5005
 
 
5020
  #endif
5021
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
5022
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
5023
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
5024
 
5025
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
5026
 
 
5097
  int last_node = cgraph->n_nodes - 1;
5098
 
5099
  // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
5100
+ while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) {
5101
  last_node -= 1;
5102
  }
5103
 
 
5106
  }
5107
 
5108
  ggml_compute_params params = {};
5109
+ params.type = GGML_TASK_TYPE_COMPUTE;
5110
  params.ith = 0;
5111
  for (int i = 0; i < cgraph->n_nodes; i++) {
5112
  ggml_tensor * node = cgraph->nodes[i];
 
5416
  static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
5417
  void * tensor_data = tensor->data;
5418
 
5419
+ if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
5420
  const size_t tensor_size = ggml_nbytes(tensor);
5421
  tensor_data = malloc(tensor_size);
5422
 
 
5442
  std::vector<const ggml_tensor *> done;
5443
  ggml_vk_print_graph_origin(tensor, done);
5444
 
5445
+ if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
5446
  free(tensor_data);
5447
  }
5448
  }
5449
 
5450
  static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
5451
  return;
5452
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
5453
  if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
5454
  return;
5455
  }
 
5487
  if (params->ith != 0) {
5488
  return;
5489
  }
5490
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
5491
  return;
5492
  }
5493
 
 
5524
 
5525
  src0_buffer = malloc(src0_size);
5526
  src0_clone->data = src0_buffer;
5527
+ if (src0->backend == GGML_BACKEND_TYPE_CPU) {
5528
  memcpy(src0_clone->data, src0->data, src0_size);
5529
  memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
5530
+ } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
5531
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
5532
  uint64_t offset = extra->offset;
5533
  if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
 
5567
 
5568
  src1_buffer = malloc(src1_size);
5569
  src1_clone->data = src1_buffer;
5570
+ if (src1->backend == GGML_BACKEND_TYPE_CPU) {
5571
  memcpy(src1_clone->data, src1->data, src1_size);
5572
  memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
5573
+ } else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
5574
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
5575
  uint64_t offset = extra->offset;
5576
  if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
 
5729
  if (params->ith != 0) {
5730
  return;
5731
  }
5732
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
5733
  return;
5734
  }
5735
  if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
 
5741
 
5742
  void * tensor_data = tensor->data;
5743
 
5744
+ if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
5745
  size_t tensor_size = ggml_nbytes(tensor);
5746
  tensor_data = malloc(tensor_size);
5747
 
 
5874
  comp_result = nullptr;
5875
  comp_size = 0;
5876
 
5877
+ if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
5878
  free(tensor_data);
5879
  }
5880
  }
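
The Vulkan hunks above are purely mechanical renames: every GGML_BACKEND_* and GGML_TASK_* value gains a _TYPE_ infix and keeps its meaning. Below is a minimal sketch (not part of this commit) of how backend code reads after the rename, assuming only the enums declared in ggml.h at this revision; the helper names are illustrative:

#include <stdbool.h>
#include "ggml.h"

// illustrative helpers, not from the commit
static bool example_tensor_on_device(const struct ggml_tensor * t) {
    // old spellings: GGML_BACKEND_GPU / GGML_BACKEND_GPU_SPLIT
    return t->backend == GGML_BACKEND_TYPE_GPU ||
           t->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
}

static bool example_skip_aux_phase(enum ggml_task_type type) {
    // old spellings: GGML_TASK_INIT / GGML_TASK_FINALIZE
    return type == GGML_TASK_TYPE_INIT || type == GGML_TASK_TYPE_FINALIZE;
}

Because only identifiers change, the large diff is behavior-neutral for callers that recompile against the updated header.
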
ggml.c CHANGED
@@ -2725,7 +2725,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2725
  }
2726
  }
2727
 
2728
- struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
2729
 
2730
  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
2731
 
@@ -2733,7 +2733,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2733
 
2734
  *result = (struct ggml_tensor) {
2735
  /*.type =*/ type,
2736
- /*.backend =*/ GGML_BACKEND_CPU,
2737
  /*.buffer =*/ NULL,
2738
  /*.ne =*/ { 1, 1, 1, 1 },
2739
  /*.nb =*/ { 0, 0, 0, 0 },
@@ -3306,7 +3306,7 @@ struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
3306
  char * const mem_buffer = ctx->mem_buffer;
3307
 
3308
  while (obj != NULL) {
3309
- if (obj->type == GGML_OBJECT_TENSOR) {
3310
  return (struct ggml_tensor *)(mem_buffer + obj->offs);
3311
  }
3312
 
@@ -3323,7 +3323,7 @@ struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struc
3323
  char * const mem_buffer = ctx->mem_buffer;
3324
 
3325
  while (obj != NULL) {
3326
- if (obj->type == GGML_OBJECT_TENSOR) {
3327
  return (struct ggml_tensor *)(mem_buffer + obj->offs);
3328
  }
3329
 
@@ -3339,7 +3339,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
3339
  char * const mem_buffer = ctx->mem_buffer;
3340
 
3341
  while (obj != NULL) {
3342
- if (obj->type == GGML_OBJECT_TENSOR) {
3343
  struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
3344
  if (strcmp(cur->name, name) == 0) {
3345
  return cur;
@@ -5883,7 +5883,7 @@ struct ggml_tensor * ggml_top_k(
5883
  int k) {
5884
  GGML_ASSERT(a->ne[0] >= k);
5885
 
5886
- struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
5887
 
5888
  result = ggml_view_4d(ctx, result,
5889
  k, result->ne[1], result->ne[2], result->ne[3],
@@ -6677,7 +6677,7 @@ static void ggml_compute_forward_dup_same_cont(
6677
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
6678
  GGML_ASSERT(src0->type == dst->type);
6679
 
6680
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
6681
  return;
6682
  }
6683
 
@@ -6709,7 +6709,7 @@ static void ggml_compute_forward_dup_f16(
6709
 
6710
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6711
 
6712
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
6713
  return;
6714
  }
6715
 
@@ -6982,7 +6982,7 @@ static void ggml_compute_forward_dup_f32(
6982
 
6983
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6984
 
6985
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
6986
  return;
6987
  }
6988
 
@@ -7235,7 +7235,7 @@ static void ggml_compute_forward_dup_bytes(
7235
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
7236
  GGML_ASSERT(src0->type == dst->type);
7237
 
7238
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7239
  return;
7240
  }
7241
 
@@ -7415,7 +7415,7 @@ static void ggml_compute_forward_add_f32(
7415
 
7416
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
7417
 
7418
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7419
  return;
7420
  }
7421
 
@@ -7423,7 +7423,7 @@ static void ggml_compute_forward_add_f32(
7423
  const int nth = params->nth;
7424
 
7425
  #ifdef GGML_USE_CLBLAST
7426
- if (src1->backend == GGML_BACKEND_GPU) {
7427
  // TODO: OpenCL kernel support full broadcast
7428
  GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7429
  if (ith == 0) {
@@ -7505,7 +7505,7 @@ static void ggml_compute_forward_add_f16_f32(
7505
 
7506
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7507
 
7508
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7509
  return;
7510
  }
7511
 
@@ -7584,7 +7584,7 @@ static void ggml_compute_forward_add_f16_f16(
7584
 
7585
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7586
 
7587
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7588
  return;
7589
  }
7590
 
@@ -7640,7 +7640,7 @@ static void ggml_compute_forward_add_q_f32(
7640
 
7641
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7642
 
7643
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7644
  return;
7645
  }
7646
 
@@ -7778,7 +7778,7 @@ static void ggml_compute_forward_add1_f32(
7778
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7779
  GGML_ASSERT(ggml_is_scalar(src1));
7780
 
7781
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7782
  return;
7783
  }
7784
 
@@ -7832,7 +7832,7 @@ static void ggml_compute_forward_add1_f16_f32(
7832
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7833
  GGML_ASSERT(ggml_is_scalar(src1));
7834
 
7835
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7836
  return;
7837
  }
7838
 
@@ -7884,7 +7884,7 @@ static void ggml_compute_forward_add1_f16_f16(
7884
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7885
  GGML_ASSERT(ggml_is_scalar(src1));
7886
 
7887
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7888
  return;
7889
  }
7890
 
@@ -7936,7 +7936,7 @@ static void ggml_compute_forward_add1_q_f32(
7936
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7937
  GGML_ASSERT(ggml_is_scalar(src1));
7938
 
7939
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7940
  return;
7941
  }
7942
 
@@ -8066,7 +8066,7 @@ static void ggml_compute_forward_acc_f32(
8066
  size_t offset = ((int32_t *) dst->op_params)[3];
8067
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
8068
 
8069
- if (!inplace && (params->type == GGML_TASK_INIT)) {
8070
  if (params->ith != 0) {
8071
  return;
8072
  }
@@ -8078,7 +8078,7 @@ static void ggml_compute_forward_acc_f32(
8078
  ggml_nbytes(dst));
8079
  }
8080
 
8081
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8082
  return;
8083
  }
8084
 
@@ -8180,7 +8180,7 @@ static void ggml_compute_forward_sub_f32(
8180
  assert(params->ith == 0);
8181
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
8182
 
8183
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8184
  return;
8185
  }
8186
 
@@ -8261,14 +8261,14 @@ static void ggml_compute_forward_mul_f32(
8261
 
8262
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8263
 
8264
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8265
  return;
8266
  }
8267
  const int ith = params->ith;
8268
  const int nth = params->nth;
8269
 
8270
  #if defined(GGML_USE_CLBLAST)
8271
- if (src1->backend == GGML_BACKEND_GPU) {
8272
  // TODO: OpenCL kernel support full broadcast
8273
  GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
8274
  if (ith == 0) {
@@ -8369,7 +8369,7 @@ static void ggml_compute_forward_div_f32(
8369
 
8370
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8371
 
8372
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8373
  return;
8374
  }
8375
 
@@ -8464,7 +8464,7 @@ static void ggml_compute_forward_sqr_f32(
8464
  assert(params->ith == 0);
8465
  assert(ggml_are_same_shape(src0, dst));
8466
 
8467
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8468
  return;
8469
  }
8470
 
@@ -8510,7 +8510,7 @@ static void ggml_compute_forward_sqrt_f32(
8510
  assert(params->ith == 0);
8511
  assert(ggml_are_same_shape(src0, dst));
8512
 
8513
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8514
  return;
8515
  }
8516
 
@@ -8556,7 +8556,7 @@ static void ggml_compute_forward_log_f32(
8556
  GGML_ASSERT(params->ith == 0);
8557
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
8558
 
8559
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8560
  return;
8561
  }
8562
 
@@ -8602,7 +8602,7 @@ static void ggml_compute_forward_sum_f32(
8602
  assert(params->ith == 0);
8603
  assert(ggml_is_scalar(dst));
8604
 
8605
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8606
  return;
8607
  }
8608
 
@@ -8637,7 +8637,7 @@ static void ggml_compute_forward_sum_f16(
8637
  assert(params->ith == 0);
8638
  assert(ggml_is_scalar(dst));
8639
 
8640
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8641
  return;
8642
  }
8643
 
@@ -8694,7 +8694,7 @@ static void ggml_compute_forward_sum_rows_f32(
8694
 
8695
  GGML_ASSERT(params->ith == 0);
8696
 
8697
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8698
  return;
8699
  }
8700
 
@@ -8749,7 +8749,7 @@ static void ggml_compute_forward_mean_f32(
8749
 
8750
  assert(params->ith == 0);
8751
 
8752
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8753
  return;
8754
  }
8755
 
@@ -8808,7 +8808,7 @@ static void ggml_compute_forward_argmax_f32(
8808
 
8809
  assert(params->ith == 0);
8810
 
8811
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8812
  return;
8813
  }
8814
 
@@ -8859,7 +8859,7 @@ static void ggml_compute_forward_repeat_f32(
8859
  GGML_ASSERT(params->ith == 0);
8860
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8861
 
8862
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8863
  return;
8864
  }
8865
 
@@ -8904,7 +8904,7 @@ static void ggml_compute_forward_repeat_f16(
8904
  GGML_ASSERT(params->ith == 0);
8905
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8906
 
8907
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8908
  return;
8909
  }
8910
 
@@ -8978,7 +8978,7 @@ static void ggml_compute_forward_repeat_back_f32(
8978
  GGML_ASSERT(params->ith == 0);
8979
  GGML_ASSERT(ggml_can_repeat(dst, src0));
8980
 
8981
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8982
  return;
8983
  }
8984
 
@@ -9055,7 +9055,7 @@ static void ggml_compute_forward_concat_f32(
9055
  const struct ggml_tensor * src0 = dst->src[0];
9056
  const struct ggml_tensor * src1 = dst->src[1];
9057
 
9058
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9059
  return;
9060
  }
9061
 
@@ -9127,7 +9127,7 @@ static void ggml_compute_forward_abs_f32(
9127
  assert(params->ith == 0);
9128
  assert(ggml_are_same_shape(src0, dst));
9129
 
9130
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9131
  return;
9132
  }
9133
 
@@ -9173,7 +9173,7 @@ static void ggml_compute_forward_sgn_f32(
9173
  assert(params->ith == 0);
9174
  assert(ggml_are_same_shape(src0, dst));
9175
 
9176
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9177
  return;
9178
  }
9179
 
@@ -9219,7 +9219,7 @@ static void ggml_compute_forward_neg_f32(
9219
  assert(params->ith == 0);
9220
  assert(ggml_are_same_shape(src0, dst));
9221
 
9222
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9223
  return;
9224
  }
9225
 
@@ -9265,7 +9265,7 @@ static void ggml_compute_forward_step_f32(
9265
  assert(params->ith == 0);
9266
  assert(ggml_are_same_shape(src0, dst));
9267
 
9268
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9269
  return;
9270
  }
9271
 
@@ -9311,7 +9311,7 @@ static void ggml_compute_forward_tanh_f32(
9311
  assert(params->ith == 0);
9312
  assert(ggml_are_same_shape(src0, dst));
9313
 
9314
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9315
  return;
9316
  }
9317
 
@@ -9357,7 +9357,7 @@ static void ggml_compute_forward_elu_f32(
9357
  assert(params->ith == 0);
9358
  assert(ggml_are_same_shape(src0, dst));
9359
 
9360
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9361
  return;
9362
  }
9363
 
@@ -9403,7 +9403,7 @@ static void ggml_compute_forward_relu_f32(
9403
  assert(params->ith == 0);
9404
  assert(ggml_are_same_shape(src0, dst));
9405
 
9406
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9407
  return;
9408
  }
9409
 
@@ -9450,7 +9450,7 @@ static void ggml_compute_forward_gelu_f32(
9450
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9451
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9452
 
9453
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9454
  return;
9455
  }
9456
 
@@ -9513,7 +9513,7 @@ static void ggml_compute_forward_gelu_quick_f32(
9513
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9514
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9515
 
9516
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9517
  return;
9518
  }
9519
 
@@ -9576,7 +9576,7 @@ static void ggml_compute_forward_silu_f32(
9576
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9577
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9578
 
9579
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9580
  return;
9581
  }
9582
 
@@ -9637,7 +9637,7 @@ static void ggml_compute_forward_leaky_relu_f32(
9637
  assert(params->ith == 0);
9638
  assert(ggml_are_same_shape(src0, dst));
9639
 
9640
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9641
  return;
9642
  }
9643
 
@@ -9690,7 +9690,7 @@ static void ggml_compute_forward_silu_back_f32(
9690
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9691
  GGML_ASSERT(ggml_are_same_shape(src0, grad));
9692
 
9693
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9694
  return;
9695
  }
9696
 
@@ -9752,7 +9752,7 @@ static void ggml_compute_forward_hardswish_f32(
9752
  assert(params->ith == 0);
9753
  assert(ggml_are_same_shape(src0, dst));
9754
 
9755
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9756
  return;
9757
  }
9758
 
@@ -9795,7 +9795,7 @@ static void ggml_compute_forward_hardsigmoid_f32(
9795
  assert(params->ith == 0);
9796
  assert(ggml_are_same_shape(src0, dst));
9797
 
9798
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9799
  return;
9800
  }
9801
 
@@ -9841,7 +9841,7 @@ static void ggml_compute_forward_norm_f32(
9841
 
9842
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9843
 
9844
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9845
  return;
9846
  }
9847
 
@@ -9916,7 +9916,7 @@ static void ggml_compute_forward_rms_norm_f32(
9916
 
9917
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9918
 
9919
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9920
  return;
9921
  }
9922
 
@@ -9987,7 +9987,7 @@ static void ggml_compute_forward_rms_norm_back_f32(
9987
 
9988
  GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
9989
 
9990
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9991
  return;
9992
  }
9993
 
@@ -10165,7 +10165,7 @@ static void ggml_compute_forward_group_norm_f32(
10165
 
10166
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10167
 
10168
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10169
  return;
10170
  }
10171
 
@@ -10332,7 +10332,7 @@ static void ggml_compute_forward_mul_mat(
10332
 
10333
  #if defined(GGML_USE_CLBLAST)
10334
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
10335
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
10336
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
10337
  }
10338
  return;
@@ -10345,7 +10345,7 @@ static void ggml_compute_forward_mul_mat(
10345
  const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
10346
  UNUSED(desired_wsize);
10347
 
10348
- if (params->type == GGML_TASK_INIT) {
10349
  if (type != GGML_TYPE_F32) {
10350
  assert(params->wsize >= desired_wsize);
10351
  // parallelize by src0 rows
@@ -10368,7 +10368,7 @@ static void ggml_compute_forward_mul_mat(
10368
  return;
10369
  }
10370
 
10371
- if (params->type == GGML_TASK_FINALIZE) {
10372
  return;
10373
  }
10374
 
@@ -10406,7 +10406,7 @@ static void ggml_compute_forward_mul_mat(
10406
  }
10407
  #endif
10408
 
10409
- if (params->type == GGML_TASK_INIT) {
10410
  if (ith != 0) {
10411
  return;
10412
  }
@@ -10430,7 +10430,7 @@ static void ggml_compute_forward_mul_mat(
10430
  return;
10431
  }
10432
 
10433
- if (params->type == GGML_TASK_FINALIZE) {
10434
  return;
10435
  }
10436
 
@@ -10587,7 +10587,7 @@ static void ggml_compute_forward_mul_mat_id(
10587
 
10588
  #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
10589
 
10590
- if (params->type == GGML_TASK_INIT) {
10591
  if (ith != 0) {
10592
  return;
10593
  }
@@ -10624,7 +10624,7 @@ static void ggml_compute_forward_mul_mat_id(
10624
  return;
10625
  }
10626
 
10627
- if (params->type == GGML_TASK_FINALIZE) {
10628
  return;
10629
  }
10630
 
@@ -10772,7 +10772,7 @@ static void ggml_compute_forward_out_prod_f32(
10772
  (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
10773
  #endif
10774
 
10775
- if (params->type == GGML_TASK_INIT) {
10776
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
10777
  if (use_blas) {
10778
  return;
@@ -10785,7 +10785,7 @@ static void ggml_compute_forward_out_prod_f32(
10785
  return;
10786
  }
10787
 
10788
- if (params->type == GGML_TASK_FINALIZE) {
10789
  return;
10790
  }
10791
 
@@ -10965,7 +10965,7 @@ static void ggml_compute_forward_out_prod_q_f32(
10965
  // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
10966
  // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
10967
 
10968
- if (params->type == GGML_TASK_INIT) {
10969
  if (ith != 0) {
10970
  return;
10971
  }
@@ -10973,7 +10973,7 @@ static void ggml_compute_forward_out_prod_q_f32(
10973
  return;
10974
  }
10975
 
10976
- if (params->type == GGML_TASK_FINALIZE) {
10977
  return;
10978
  }
10979
 
@@ -11091,7 +11091,7 @@ static void ggml_compute_forward_scale_f32(
11091
  GGML_ASSERT(ggml_is_contiguous(dst));
11092
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11093
 
11094
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11095
  return;
11096
  }
11097
 
@@ -11163,7 +11163,7 @@ static void ggml_compute_forward_set_f32(
11163
  size_t offset = ((int32_t *) dst->op_params)[3];
11164
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
11165
 
11166
- if (!inplace && (params->type == GGML_TASK_INIT)) {
11167
  if (params->ith != 0) {
11168
  return;
11169
  }
@@ -11175,7 +11175,7 @@ static void ggml_compute_forward_set_f32(
11175
  ggml_nbytes(dst));
11176
  }
11177
 
11178
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11179
  return;
11180
  }
11181
 
@@ -11323,7 +11323,7 @@ static void ggml_compute_forward_get_rows_q(
11323
 
11324
  assert(params->ith == 0);
11325
 
11326
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11327
  return;
11328
  }
11329
 
@@ -11363,7 +11363,7 @@ static void ggml_compute_forward_get_rows_f16(
11363
 
11364
  assert(params->ith == 0);
11365
 
11366
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11367
  return;
11368
  }
11369
 
@@ -11400,7 +11400,7 @@ static void ggml_compute_forward_get_rows_f32(
11400
 
11401
  assert(params->ith == 0);
11402
 
11403
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11404
  return;
11405
  }
11406
 
@@ -11503,14 +11503,14 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
11503
 
11504
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
11505
 
11506
- if (params->type == GGML_TASK_INIT) {
11507
  if (params->ith != 0) {
11508
  return;
11509
  }
11510
  memset(dst->data, 0, ggml_nbytes(dst));
11511
  }
11512
 
11513
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11514
  return;
11515
  }
11516
 
@@ -11542,14 +11542,14 @@ static void ggml_compute_forward_get_rows_back_f32(
11542
 
11543
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
11544
 
11545
- if (params->type == GGML_TASK_INIT) {
11546
  if (params->ith != 0) {
11547
  return;
11548
  }
11549
  memset(dst->data, 0, ggml_nbytes(dst));
11550
  }
11551
 
11552
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11553
  return;
11554
  }
11555
 
@@ -11619,7 +11619,7 @@ static void ggml_compute_forward_diag_f32(
11619
 
11620
  GGML_ASSERT(params->ith == 0);
11621
 
11622
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11623
  return;
11624
  }
11625
 
@@ -11688,7 +11688,7 @@ static void ggml_compute_forward_diag_mask_f32(
11688
 
11689
  GGML_ASSERT(n_past >= 0);
11690
 
11691
- if (!inplace && (params->type == GGML_TASK_INIT)) {
11692
  if (ith != 0) {
11693
  return;
11694
  }
@@ -11702,7 +11702,7 @@ static void ggml_compute_forward_diag_mask_f32(
11702
  ggml_nbytes(dst));
11703
  }
11704
 
11705
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11706
  return;
11707
  }
11708
 
@@ -11776,7 +11776,7 @@ static void ggml_compute_forward_soft_max_f32(
11776
  assert(ggml_is_contiguous(dst));
11777
  assert(ggml_are_same_shape(src0, dst));
11778
 
11779
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11780
  return;
11781
  }
11782
 
@@ -11914,7 +11914,7 @@ static void ggml_compute_forward_soft_max_back_f32(
11914
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11915
  GGML_ASSERT(ggml_are_same_shape(src1, dst));
11916
 
11917
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11918
  return;
11919
  }
11920
 
@@ -12008,7 +12008,7 @@ static void ggml_compute_forward_alibi_f32(
12008
 
12009
  assert(params->ith == 0);
12010
 
12011
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12012
  return;
12013
  }
12014
 
@@ -12067,7 +12067,7 @@ static void ggml_compute_forward_alibi_f16(
12067
 
12068
  assert(params->ith == 0);
12069
 
12070
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12071
  return;
12072
  }
12073
 
@@ -12174,7 +12174,7 @@ static void ggml_compute_forward_clamp_f32(
12174
 
12175
  assert(params->ith == 0);
12176
 
12177
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12178
  return;
12179
  }
12180
 
@@ -12314,7 +12314,7 @@ static void ggml_compute_forward_rope_f32(
12314
  const struct ggml_tensor * src0 = dst->src[0];
12315
  const struct ggml_tensor * src1 = dst->src[1];
12316
 
12317
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12318
  return;
12319
  }
12320
 
@@ -12492,7 +12492,7 @@ static void ggml_compute_forward_rope_f16(
12492
  const struct ggml_tensor * src0 = dst->src[0];
12493
  const struct ggml_tensor * src1 = dst->src[1];
12494
 
12495
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12496
  return;
12497
  }
12498
 
@@ -12723,7 +12723,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12723
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12724
  GGML_ASSERT(nb10 == sizeof(float));
12725
 
12726
- if (params->type == GGML_TASK_INIT) {
12727
  if (ith != 0) {
12728
  return;
12729
  }
@@ -12763,7 +12763,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12763
  return;
12764
  }
12765
 
12766
- if (params->type == GGML_TASK_FINALIZE) {
12767
  return;
12768
  }
12769
 
@@ -12822,7 +12822,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
12822
  GGML_ASSERT(nb00 == sizeof(float));
12823
  GGML_ASSERT(nb10 == sizeof(float));
12824
 
12825
- if (params->type == GGML_TASK_INIT) {
12826
  if (ith != 0) {
12827
  return;
12828
  }
@@ -12862,7 +12862,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
12862
  return;
12863
  }
12864
 
12865
- if (params->type == GGML_TASK_FINALIZE) {
12866
  return;
12867
  }
12868
 
@@ -12966,11 +12966,11 @@ static void ggml_compute_forward_im2col_f32(
12966
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12967
  GGML_ASSERT(nb10 == sizeof(float));
12968
 
12969
- if (params->type == GGML_TASK_INIT) {
12970
  return;
12971
  }
12972
 
12973
- if (params->type == GGML_TASK_FINALIZE) {
12974
  return;
12975
  }
12976
 
@@ -13054,11 +13054,11 @@ static void ggml_compute_forward_im2col_f16(
13054
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13055
  GGML_ASSERT(nb10 == sizeof(float));
13056
 
13057
- if (params->type == GGML_TASK_INIT) {
13058
  return;
13059
  }
13060
 
13061
- if (params->type == GGML_TASK_FINALIZE) {
13062
  return;
13063
  }
13064
 
@@ -13140,7 +13140,7 @@ static void ggml_compute_forward_conv_transpose_2d(
13140
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13141
  GGML_ASSERT(nb10 == sizeof(float));
13142
 
13143
- if (params->type == GGML_TASK_INIT) {
13144
  if (ith != 0) {
13145
  return;
13146
  }
@@ -13182,7 +13182,7 @@ static void ggml_compute_forward_conv_transpose_2d(
13182
  return;
13183
  }
13184
 
13185
- if (params->type == GGML_TASK_FINALIZE) {
13186
  return;
13187
  }
13188
 
@@ -13234,7 +13234,7 @@ static void ggml_compute_forward_pool_1d_sk_p0(
13234
  assert(src->type == GGML_TYPE_F32);
13235
  assert(params->ith == 0);
13236
 
13237
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13238
  return;
13239
  }
13240
 
@@ -13303,7 +13303,7 @@ static void ggml_compute_forward_pool_2d(
13303
  GGML_ASSERT(src->type == GGML_TYPE_F32);
13304
  GGML_ASSERT(params->ith == 0);
13305
 
13306
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13307
  return;
13308
  }
13309
 
@@ -13376,7 +13376,7 @@ static void ggml_compute_forward_upscale_f32(
13376
 
13377
  const struct ggml_tensor * src0 = dst->src[0];
13378
 
13379
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13380
  return;
13381
  }
13382
 
@@ -13436,7 +13436,7 @@ static void ggml_compute_forward_pad_f32(
13436
 
13437
  const struct ggml_tensor * src0 = dst->src[0];
13438
 
13439
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13440
  return;
13441
  }
13442
 
@@ -13497,7 +13497,7 @@ static void ggml_compute_forward_argsort_f32(
13497
 
13498
  const struct ggml_tensor * src0 = dst->src[0];
13499
 
13500
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13501
  return;
13502
  }
13503
 
@@ -13523,8 +13523,8 @@ static void ggml_compute_forward_argsort_f32(
13523
  // C doesn't have a functional sort, so we do a bubble sort instead
13524
  for (int64_t j = 0; j < ne0; j++) {
13525
  for (int64_t k = j + 1; k < ne0; k++) {
13526
- if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
13527
- (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
13528
  int32_t tmp = dst_data[j];
13529
  dst_data[j] = dst_data[k];
13530
  dst_data[k] = tmp;
@@ -13607,11 +13607,11 @@ static void ggml_compute_forward_flash_attn_f32(
13607
  GGML_ASSERT(nb1 <= nb2);
13608
  GGML_ASSERT(nb2 <= nb3);
13609
 
13610
- if (params->type == GGML_TASK_INIT) {
13611
  return;
13612
  }
13613
 
13614
- if (params->type == GGML_TASK_FINALIZE) {
13615
  return;
13616
  }
13617
 
@@ -13799,11 +13799,11 @@ static void ggml_compute_forward_flash_attn_f16(
13799
  GGML_ASSERT(nb1 <= nb2);
13800
  GGML_ASSERT(nb2 <= nb3);
13801
 
13802
- if (params->type == GGML_TASK_INIT) {
13803
  return;
13804
  }
13805
 
13806
- if (params->type == GGML_TASK_FINALIZE) {
13807
  return;
13808
  }
13809
 
@@ -14058,11 +14058,11 @@ static void ggml_compute_forward_flash_ff_f16(
14058
  GGML_ASSERT(nb1 <= nb2);
14059
  GGML_ASSERT(nb2 <= nb3);
14060
 
14061
- if (params->type == GGML_TASK_INIT) {
14062
  return;
14063
  }
14064
 
14065
- if (params->type == GGML_TASK_FINALIZE) {
14066
  return;
14067
  }
14068
 
@@ -14217,14 +14217,14 @@ static void ggml_compute_forward_flash_attn_back_f32(
14217
  GGML_ASSERT(nb1 <= nb2);
14218
  GGML_ASSERT(nb2 <= nb3);
14219
 
14220
- if (params->type == GGML_TASK_INIT) {
14221
  if (ith == 0) {
14222
  memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
14223
  }
14224
  return;
14225
  }
14226
 
14227
- if (params->type == GGML_TASK_FINALIZE) {
14228
  return;
14229
  }
14230
 
@@ -14540,7 +14540,7 @@ static void ggml_compute_forward_win_part_f32(
14540
 
14541
  const struct ggml_tensor * src0 = dst->src[0];
14542
 
14543
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14544
  return;
14545
  }
14546
 
@@ -14606,7 +14606,7 @@ static void ggml_compute_forward_win_unpart_f32(
14606
 
14607
  const struct ggml_tensor * src0 = dst->src[0];
14608
 
14609
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14610
  return;
14611
  }
14612
 
@@ -14734,7 +14734,7 @@ static void ggml_compute_forward_get_rel_pos_f16(
14734
 
14735
  const struct ggml_tensor * src0 = dst->src[0];
14736
 
14737
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14738
  return;
14739
  }
14740
 
@@ -14786,14 +14786,14 @@ static void ggml_compute_forward_add_rel_pos_f32(
14786
  const struct ggml_tensor * src2 = dst->src[2];
14787
 
14788
  const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
14789
- if (!inplace && params->type == GGML_TASK_INIT) {
14790
  if (params->ith != 0) {
14791
  return;
14792
  }
14793
  memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
14794
  return;
14795
  }
14796
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14797
  return;
14798
  }
14799
 
@@ -14875,7 +14875,7 @@ static void ggml_compute_forward_map_unary_f32(
14875
 
14876
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
14877
 
14878
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14879
  return;
14880
  }
14881
 
@@ -14924,7 +14924,7 @@ static void ggml_compute_forward_map_binary_f32(
14924
  assert(params->ith == 0);
14925
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
14926
 
14927
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14928
  return;
14929
  }
14930
 
@@ -14973,7 +14973,7 @@ static void ggml_compute_forward_map_custom1_f32(
14973
 
14974
  assert(params->ith == 0);
14975
 
14976
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14977
  return;
14978
  }
14979
 
@@ -14992,7 +14992,7 @@ static void ggml_compute_forward_map_custom2_f32(
14992
 
14993
  assert(params->ith == 0);
14994
 
14995
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14996
  return;
14997
  }
14998
 
@@ -15012,7 +15012,7 @@ static void ggml_compute_forward_map_custom3_f32(
15012
 
15013
  assert(params->ith == 0);
15014
 
15015
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15016
  return;
15017
  }
15018
 
@@ -15027,7 +15027,7 @@ static void ggml_compute_forward_map_custom1(
15027
 
15028
  const struct ggml_tensor * a = dst->src[0];
15029
 
15030
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15031
  return;
15032
  }
15033
 
@@ -15045,7 +15045,7 @@ static void ggml_compute_forward_map_custom2(
15045
  const struct ggml_tensor * a = dst->src[0];
15046
  const struct ggml_tensor * b = dst->src[1];
15047
 
15048
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15049
  return;
15050
  }
15051
 
@@ -15064,7 +15064,7 @@ static void ggml_compute_forward_map_custom3(
15064
  const struct ggml_tensor * b = dst->src[1];
15065
  const struct ggml_tensor * c = dst->src[2];
15066
 
15067
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15068
  return;
15069
  }
15070
 
@@ -15098,14 +15098,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15098
 
15099
  GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
15100
 
15101
- if (params->type == GGML_TASK_INIT) {
15102
  if (ith == 0) {
15103
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
15104
  }
15105
  return;
15106
  }
15107
 
15108
- if (params->type == GGML_TASK_FINALIZE) {
15109
  if (ith == 0) {
15110
  float * dp = (float *) dst->data;
15111
  ggml_vec_sum_f32(nth, dp, sums);
@@ -15220,7 +15220,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15220
  const int64_t ith = params->ith;
15221
  const int64_t nth = params->nth;
15222
 
15223
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15224
  return;
15225
  }
15226
 
@@ -15327,8 +15327,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15327
  if (skip_cpu) {
15328
  return;
15329
  }
15330
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
15331
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
15332
  #elif defined(GGML_USE_VULKAN)
15333
  const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
15334
  #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -15339,8 +15339,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15339
  if (skip_cpu) {
15340
  return;
15341
  }
15342
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
15343
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
15344
  #endif // GGML_USE_CUBLAS
15345
 
15346
  #ifdef GGML_USE_SYCL
@@ -16886,7 +16886,7 @@ size_t ggml_graph_overhead(void) {
16886
 
16887
  struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
16888
  const size_t obj_size = ggml_graph_nbytes(size, grads);
16889
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
16890
  struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
16891
 
16892
  struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
@@ -17433,7 +17433,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17433
  set_numa_thread_affinity(state->ith);
17434
 
17435
  int node_n = -1;
17436
- int task_phase = GGML_TASK_FINALIZE;
17437
 
17438
  while (true) {
17439
  if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -17445,7 +17445,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17445
  // all other threads are finished and spinning
17446
  // do finalize and init here so we don't have synchronize again
17447
  struct ggml_compute_params params = {
17448
- /*.type =*/ GGML_TASK_FINALIZE,
17449
  /*.ith =*/ 0,
17450
  /*.nth =*/ 0,
17451
  /*.wsize =*/ cplan->work_size,
@@ -17476,17 +17476,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17476
  if (n_tasks == 1) {
17477
  /* INIT */
17478
  if (GGML_OP_HAS_INIT[node->op]) {
17479
- params.type = GGML_TASK_INIT;
17480
  ggml_compute_forward(&params, node);
17481
  }
17482
 
17483
  // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
17484
  // they do something more efficient than spinning (?)
17485
- params.type = GGML_TASK_COMPUTE;
17486
  ggml_compute_forward(&params, node);
17487
 
17488
  if (GGML_OP_HAS_FINALIZE[node->op]) {
17489
- params.type = GGML_TASK_FINALIZE;
17490
  ggml_compute_forward(&params, node);
17491
  }
17492
 
@@ -17500,7 +17500,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17500
  }
17501
  }
17502
 
17503
- task_phase = GGML_TASK_INIT;
17504
  atomic_store(&state->shared->n_active, n_threads);
17505
  atomic_store(&state->shared->node_n, node_n);
17506
  atomic_store(&state->shared->node_task, task_phase);
@@ -17517,7 +17517,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17517
  const int n_tasks = ggml_get_n_tasks(node, n_threads);
17518
 
17519
  struct ggml_compute_params params = {
17520
- /*.type =*/ GGML_TASK_INIT,
17521
  /*.ith =*/ state->ith,
17522
  /*.nth =*/ n_tasks,
17523
  /*.wsize =*/ cplan->work_size,
@@ -17531,7 +17531,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17531
  }
17532
 
17533
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17534
- task_phase = GGML_TASK_COMPUTE;
17535
  atomic_store(&state->shared->n_active, n_threads);
17536
  atomic_store(&state->shared->node_task, task_phase);
17537
  }
@@ -17546,12 +17546,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17546
  }
17547
 
17548
  if (state->ith < n_tasks) {
17549
- params.type = GGML_TASK_COMPUTE;
17550
  ggml_compute_forward(&params, node);
17551
  }
17552
 
17553
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17554
- task_phase = GGML_TASK_FINALIZE;
17555
  atomic_store(&state->shared->n_active, n_threads);
17556
  atomic_store(&state->shared->node_task, task_phase);
17557
  }
@@ -17787,7 +17787,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17787
  /*.n_threads =*/ n_threads,
17788
  /*.n_active =*/ n_threads,
17789
  /*.node_n =*/ -1,
17790
- /*.node_task =*/ GGML_TASK_FINALIZE,
17791
  /*.abort_callback =*/ NULL,
17792
  /*.abort_callback_data =*/ NULL,
17793
  };
@@ -17855,7 +17855,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17855
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
17856
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
17857
 
17858
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
17859
 
17860
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
17861
 
@@ -18663,7 +18663,7 @@ static enum ggml_opt_result ggml_opt_adam(
18663
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
18664
 
18665
  struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18666
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18667
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18668
 
18669
  bool cancel = false;
@@ -18675,7 +18675,7 @@ static enum ggml_opt_result ggml_opt_adam(
18675
  if (callback) {
18676
  callback(callback_data, accum_step, &sched, &cancel);
18677
  if (cancel) {
18678
- return GGML_OPT_CANCEL;
18679
  }
18680
  }
18681
  // ggml_graph_reset (gf);
@@ -18766,7 +18766,7 @@ static enum ggml_opt_result ggml_opt_adam(
18766
  if (callback) {
18767
  callback(callback_data, accum_step, &sched, &cancel);
18768
  if (cancel) {
18769
- return GGML_OPT_CANCEL;;
18770
  }
18771
  }
18772
  // ggml_graph_reset (gf);
@@ -18783,7 +18783,7 @@ static enum ggml_opt_result ggml_opt_adam(
18783
  if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
18784
  GGML_PRINT_DEBUG("converged\n");
18785
 
18786
- return GGML_OPT_OK;
18787
  }
18788
 
18789
  // delta-based convergence test
@@ -18793,7 +18793,7 @@ static enum ggml_opt_result ggml_opt_adam(
18793
  const float rate = (pf[(iter0 + t)%params.past] - fx)/fx;
18794
 
18795
  if (fabsf(rate) < params.delta) {
18796
- return GGML_OPT_OK;
18797
  }
18798
  }
18799
 
@@ -18809,7 +18809,7 @@ static enum ggml_opt_result ggml_opt_adam(
18809
  ++n_no_improvement[0];
18810
 
18811
  if (n_no_improvement[0] >= params.max_no_improvement) {
18812
- return GGML_OPT_OK;
18813
  }
18814
  }
18815
  }
@@ -18827,7 +18827,7 @@ static enum ggml_opt_result ggml_opt_adam(
18827
  }
18828
  }
18829
 
18830
- return GGML_OPT_DID_NOT_CONVERGE;
18831
  }
18832
 
18833
  //
@@ -18908,7 +18908,7 @@ static enum ggml_opt_result linesearch_backtracking(
18908
  float sched = 0;
18909
  callback(callback_data, accum_step, &sched, cancel);
18910
  if (*cancel) {
18911
- return GGML_OPT_CANCEL;
18912
  }
18913
  }
18914
  // ggml_graph_reset (gf);
@@ -18981,7 +18981,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18981
  if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
18982
  params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
18983
  if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
18984
- return GGML_OPT_INVALID_WOLFE;
18985
  }
18986
  }
18987
 
@@ -19010,7 +19010,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19010
  }
19011
 
19012
  struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
19013
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
19014
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
19015
 
19016
  float * x = opt->lbfgs.x->data; // current parameters
@@ -19051,7 +19051,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19051
  float sched = 0;
19052
  callback(callback_data, accum_step, &sched, &cancel);
19053
  if (cancel) {
19054
- return GGML_OPT_CANCEL;
19055
  }
19056
  }
19057
  // ggml_graph_reset (gf);
@@ -19079,7 +19079,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19079
 
19080
  // already optimized
19081
  if (gnorm/xnorm <= params.lbfgs.eps) {
19082
- return GGML_OPT_OK;
19083
  }
19084
 
19085
  if (opt->just_initialized) {
@@ -19124,7 +19124,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19124
  // way to test and don't want to break something with so many changes lined up
19125
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
19126
  if (cancel) {
19127
- return GGML_OPT_CANCEL;
19128
  }
19129
 
19130
  if (ls < 0) {
@@ -19147,7 +19147,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19147
  }
19148
  if (gnorm/xnorm <= params.lbfgs.eps) {
19149
  // converged
19150
- return GGML_OPT_OK;
19151
  }
19152
 
19153
  // delta-based convergence test
@@ -19157,7 +19157,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19157
  const float rate = (pf[k[0]%params.past] - fx)/fx;
19158
 
19159
  if (fabsf(rate) < params.delta) {
19160
- return GGML_OPT_OK;
19161
  }
19162
  }
19163
 
@@ -19173,14 +19173,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19173
  n_no_improvement[0]++;
19174
 
19175
  if (n_no_improvement[0] >= params.max_no_improvement) {
19176
- return GGML_OPT_OK;
19177
  }
19178
  }
19179
  }
19180
 
19181
  if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) {
19182
  // reached the maximum number of iterations
19183
- return GGML_OPT_DID_NOT_CONVERGE;
19184
  }
19185
 
19186
  // update vectors s and y:
@@ -19236,17 +19236,17 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19236
 
19237
  GGML_ASSERT(false && "lbfgs failed");
19238
 
19239
- return GGML_OPT_DID_NOT_CONVERGE;
19240
  }
19241
 
19242
  struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
19243
  struct ggml_opt_params result;
19244
 
19245
  switch (type) {
19246
- case GGML_OPT_ADAM:
19247
  {
19248
  result = (struct ggml_opt_params) {
19249
- .type = GGML_OPT_ADAM,
19250
  .graph_size = GGML_DEFAULT_GRAPH_SIZE,
19251
  .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
19252
  .past = 0,
@@ -19274,10 +19274,10 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
19274
  },
19275
  };
19276
  } break;
19277
- case GGML_OPT_LBFGS:
19278
  {
19279
  result = (struct ggml_opt_params) {
19280
- .type = GGML_OPT_LBFGS,
19281
  .graph_size = GGML_DEFAULT_GRAPH_SIZE,
19282
  .n_threads = 1,
19283
  .past = 0,
@@ -19322,12 +19322,12 @@ GGML_API void ggml_opt_init(
19322
  opt->just_initialized = true;
19323
  if (opt->ctx == NULL) {
19324
  struct ggml_init_params ctx_opt_params;
19325
- if (opt->params.type == GGML_OPT_ADAM) {
19326
  ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3;
19327
  if (opt->params.past > 0) {
19328
  ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
19329
  }
19330
- } else if (opt->params.type == GGML_OPT_LBFGS) {
19331
  ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2);
19332
  if (opt->params.past > 0) {
19333
  ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
@@ -19339,7 +19339,7 @@ GGML_API void ggml_opt_init(
19339
  opt->ctx = ggml_init(ctx_opt_params);
19340
  }
19341
  switch (opt->params.type) {
19342
- case GGML_OPT_ADAM:
19343
  {
19344
  opt->adam.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
19345
  opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
@@ -19353,7 +19353,7 @@ GGML_API void ggml_opt_init(
19353
  ggml_set_zero(opt->adam.pf);
19354
  }
19355
  } break;
19356
- case GGML_OPT_LBFGS:
19357
  {
19358
  opt->lbfgs.x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
19359
  opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
@@ -19397,13 +19397,13 @@ enum ggml_opt_result ggml_opt(
19397
 
19398
  ctx = ggml_init(params_ctx);
19399
  if (ctx == NULL) {
19400
- return GGML_OPT_NO_CONTEXT;
19401
  }
19402
 
19403
  free_ctx = true;
19404
  }
19405
 
19406
- enum ggml_opt_result result = GGML_OPT_OK;
19407
 
19408
  struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context));
19409
 
@@ -19442,14 +19442,14 @@ enum ggml_opt_result ggml_opt_resume_g(
19442
  void * callback_data) {
19443
 
19444
  // build forward + backward compute graphs
19445
- enum ggml_opt_result result = GGML_OPT_OK;
19446
 
19447
  switch (opt->params.type) {
19448
- case GGML_OPT_ADAM:
19449
  {
19450
  result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19451
  } break;
19452
- case GGML_OPT_LBFGS:
19453
  {
19454
  result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19455
  } break;
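
The optimizer enums follow the same normalization, but the new-side lines for these hunks are not included in this excerpt. The sketch below therefore assumes the same pattern applies, i.e. GGML_OPT_ADAM becoming GGML_OPT_TYPE_ADAM and GGML_OPT_OK becoming GGML_OPT_RESULT_OK; treat those two spellings as assumptions rather than quotes from the commit. The ggml_opt_default_params and ggml_opt calls themselves appear in the hunks above:

#include <stdbool.h>
#include "ggml.h"

// hypothetical caller update; the post-rename value names are assumed here,
// not copied from the (unshown) new side of this hunk
static bool example_run_adam(struct ggml_context * ctx, struct ggml_tensor * f) {
    struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
    enum ggml_opt_result res = ggml_opt(ctx, params, f);
    return res == GGML_OPT_RESULT_OK;
}
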
 
2725
  }
2726
  }
2727
 
2728
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
2729
 
2730
  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
2731
 
 
2733
 
2734
  *result = (struct ggml_tensor) {
2735
  /*.type =*/ type,
2736
+ /*.backend =*/ GGML_BACKEND_TYPE_CPU,
2737
  /*.buffer =*/ NULL,
2738
  /*.ne =*/ { 1, 1, 1, 1 },
2739
  /*.nb =*/ { 0, 0, 0, 0 },
 
3306
  char * const mem_buffer = ctx->mem_buffer;
3307
 
3308
  while (obj != NULL) {
3309
+ if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
3310
  return (struct ggml_tensor *)(mem_buffer + obj->offs);
3311
  }
3312
 
 
3323
  char * const mem_buffer = ctx->mem_buffer;
3324
 
3325
  while (obj != NULL) {
3326
+ if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
3327
  return (struct ggml_tensor *)(mem_buffer + obj->offs);
3328
  }
3329
 
 
3339
  char * const mem_buffer = ctx->mem_buffer;
3340
 
3341
  while (obj != NULL) {
3342
+ if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
3343
  struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
3344
  if (strcmp(cur->name, name) == 0) {
3345
  return cur;
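
The GGML_OBJECT_TYPE_TENSOR checks above live inside the context-iteration helpers, so caller code is untouched by the rename. A small illustrative sketch of the unchanged caller side, using the ggml_get_first_tensor / ggml_get_next_tensor helpers from the hunks above:

#include <stdio.h>
#include "ggml.h"

// walk every tensor object in a context; the object-type filtering happens
// inside the helpers, exactly as shown in the hunks above
static void example_list_tensors(struct ggml_context * ctx) {
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx);
         t != NULL;
         t = ggml_get_next_tensor(ctx, t)) {
        printf("tensor: %s\n", t->name);
    }
}
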
 
5883
  int k) {
5884
  GGML_ASSERT(a->ne[0] >= k);
5885
 
5886
+ struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
5887
 
5888
  result = ggml_view_4d(ctx, result,
5889
  k, result->ne[1], result->ne[2], result->ne[3],
 
6677
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
6678
  GGML_ASSERT(src0->type == dst->type);
6679
 
6680
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
6681
  return;
6682
  }
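
In the ggml_top_k hunk a little earlier, ggml_argsort is now called with GGML_SORT_ORDER_DESC. A one-line illustrative wrapper (not from the commit) showing the renamed value from the caller side:

#include "ggml.h"

// builds an argsort node in descending order (old spelling: GGML_SORT_DESC)
static struct ggml_tensor * example_argsort_desc(struct ggml_context * ctx,
                                                 struct ggml_tensor * a) {
    return ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
}
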
6683
 
 
6709
 
6710
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6711
 
6712
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
6713
  return;
6714
  }
6715
 
 
6982
 
6983
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6984
 
6985
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
6986
  return;
6987
  }
6988
 
 
7235
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
7236
  GGML_ASSERT(src0->type == dst->type);
7237
 
7238
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7239
  return;
7240
  }
7241
 
 
7415
 
7416
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
7417
 
7418
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7419
  return;
7420
  }
7421
 
 
7423
  const int nth = params->nth;
7424
 
7425
  #ifdef GGML_USE_CLBLAST
7426
+ if (src1->backend == GGML_BACKEND_TYPE_GPU) {
7427
  // TODO: OpenCL kernel support full broadcast
7428
  GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7429
  if (ith == 0) {
 
7505
 
7506
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7507
 
7508
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7509
  return;
7510
  }
7511
 
 
7584
 
7585
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7586
 
7587
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7588
  return;
7589
  }
7590
 
 
7640
 
7641
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7642
 
7643
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7644
  return;
7645
  }
7646
 
 
7778
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7779
  GGML_ASSERT(ggml_is_scalar(src1));
7780
 
7781
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7782
  return;
7783
  }
7784
 
 
7832
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7833
  GGML_ASSERT(ggml_is_scalar(src1));
7834
 
7835
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7836
  return;
7837
  }
7838
 
 
7884
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7885
  GGML_ASSERT(ggml_is_scalar(src1));
7886
 
7887
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7888
  return;
7889
  }
7890
 
 
7936
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7937
  GGML_ASSERT(ggml_is_scalar(src1));
7938
 
7939
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7940
  return;
7941
  }
7942
 
 
8066
  size_t offset = ((int32_t *) dst->op_params)[3];
8067
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
8068
 
8069
+ if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
8070
  if (params->ith != 0) {
8071
  return;
8072
  }
 
8078
  ggml_nbytes(dst));
8079
  }
8080
 
8081
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8082
  return;
8083
  }
8084
 
 
8180
  assert(params->ith == 0);
8181
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
8182
 
8183
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8184
  return;
8185
  }
8186
 
 
8261
 
8262
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8263
 
8264
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8265
  return;
8266
  }
8267
  const int ith = params->ith;
8268
  const int nth = params->nth;
8269
 
8270
  #if defined(GGML_USE_CLBLAST)
8271
+ if (src1->backend == GGML_BACKEND_TYPE_GPU) {
8272
  // TODO: OpenCL kernel support full broadcast
8273
  GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
8274
  if (ith == 0) {
 
8369
 
8370
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8371
 
8372
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8373
  return;
8374
  }
8375
 
 
8464
  assert(params->ith == 0);
8465
  assert(ggml_are_same_shape(src0, dst));
8466
 
8467
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8468
  return;
8469
  }
8470
 
 
8510
  assert(params->ith == 0);
8511
  assert(ggml_are_same_shape(src0, dst));
8512
 
8513
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8514
  return;
8515
  }
8516
 
 
8556
  GGML_ASSERT(params->ith == 0);
8557
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
8558
 
8559
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8560
  return;
8561
  }
8562
 
 
8602
  assert(params->ith == 0);
8603
  assert(ggml_is_scalar(dst));
8604
 
8605
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8606
  return;
8607
  }
8608
 
 
8637
  assert(params->ith == 0);
8638
  assert(ggml_is_scalar(dst));
8639
 
8640
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8641
  return;
8642
  }
8643
 
 
8694
 
8695
  GGML_ASSERT(params->ith == 0);
8696
 
8697
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8698
  return;
8699
  }
8700
 
 
8749
 
8750
  assert(params->ith == 0);
8751
 
8752
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8753
  return;
8754
  }
8755
 
 
8808
 
8809
  assert(params->ith == 0);
8810
 
8811
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8812
  return;
8813
  }
8814
 
 
8859
  GGML_ASSERT(params->ith == 0);
8860
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8861
 
8862
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8863
  return;
8864
  }
8865
 
 
8904
  GGML_ASSERT(params->ith == 0);
8905
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8906
 
8907
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8908
  return;
8909
  }
8910
 
 
8978
  GGML_ASSERT(params->ith == 0);
8979
  GGML_ASSERT(ggml_can_repeat(dst, src0));
8980
 
8981
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8982
  return;
8983
  }
8984
 
 
9055
  const struct ggml_tensor * src0 = dst->src[0];
9056
  const struct ggml_tensor * src1 = dst->src[1];
9057
 
9058
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9059
  return;
9060
  }
9061
 
 
9127
  assert(params->ith == 0);
9128
  assert(ggml_are_same_shape(src0, dst));
9129
 
9130
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9131
  return;
9132
  }
9133
 
 
9173
  assert(params->ith == 0);
9174
  assert(ggml_are_same_shape(src0, dst));
9175
 
9176
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9177
  return;
9178
  }
9179
 
 
9219
  assert(params->ith == 0);
9220
  assert(ggml_are_same_shape(src0, dst));
9221
 
9222
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9223
  return;
9224
  }
9225
 
 
9265
  assert(params->ith == 0);
9266
  assert(ggml_are_same_shape(src0, dst));
9267
 
9268
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9269
  return;
9270
  }
9271
 
 
9311
  assert(params->ith == 0);
9312
  assert(ggml_are_same_shape(src0, dst));
9313
 
9314
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9315
  return;
9316
  }
9317
 
 
9357
  assert(params->ith == 0);
9358
  assert(ggml_are_same_shape(src0, dst));
9359
 
9360
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9361
  return;
9362
  }
9363
 
 
9403
  assert(params->ith == 0);
9404
  assert(ggml_are_same_shape(src0, dst));
9405
 
9406
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9407
  return;
9408
  }
9409
 
 
9450
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9451
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9452
 
9453
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9454
  return;
9455
  }
9456
 
 
9513
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9514
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9515
 
9516
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9517
  return;
9518
  }
9519
 
 
9576
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9577
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9578
 
9579
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9580
  return;
9581
  }
9582
 
 
9637
  assert(params->ith == 0);
9638
  assert(ggml_are_same_shape(src0, dst));
9639
 
9640
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9641
  return;
9642
  }
9643
 
 
9690
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9691
  GGML_ASSERT(ggml_are_same_shape(src0, grad));
9692
 
9693
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9694
  return;
9695
  }
9696
 
 
9752
  assert(params->ith == 0);
9753
  assert(ggml_are_same_shape(src0, dst));
9754
 
9755
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9756
  return;
9757
  }
9758
 
 
9795
  assert(params->ith == 0);
9796
  assert(ggml_are_same_shape(src0, dst));
9797
 
9798
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9799
  return;
9800
  }
9801
 
 
9841
 
9842
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9843
 
9844
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9845
  return;
9846
  }
9847
 
 
9916
 
9917
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9918
 
9919
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9920
  return;
9921
  }
9922
 
 
9987
 
9988
  GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
9989
 
9990
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9991
  return;
9992
  }
9993
 
 
10165
 
10166
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10167
 
10168
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
10169
  return;
10170
  }
10171
 
 
10332
 
10333
  #if defined(GGML_USE_CLBLAST)
10334
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
10335
+ if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
10336
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
10337
  }
10338
  return;
 
10345
  const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
10346
  UNUSED(desired_wsize);
10347
 
10348
+ if (params->type == GGML_TASK_TYPE_INIT) {
10349
  if (type != GGML_TYPE_F32) {
10350
  assert(params->wsize >= desired_wsize);
10351
  // parallelize by src0 rows
 
10368
  return;
10369
  }
10370
 
10371
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10372
  return;
10373
  }
10374
 
 
10406
  }
10407
  #endif
10408
 
10409
+ if (params->type == GGML_TASK_TYPE_INIT) {
10410
  if (ith != 0) {
10411
  return;
10412
  }
 
10430
  return;
10431
  }
10432
 
10433
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10434
  return;
10435
  }
10436
 
 
10587
 
10588
  #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
10589
 
10590
+ if (params->type == GGML_TASK_TYPE_INIT) {
10591
  if (ith != 0) {
10592
  return;
10593
  }
 
10624
  return;
10625
  }
10626
 
10627
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10628
  return;
10629
  }
10630
 
 
10772
  (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
10773
  #endif
10774
 
10775
+ if (params->type == GGML_TASK_TYPE_INIT) {
10776
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
10777
  if (use_blas) {
10778
  return;
 
10785
  return;
10786
  }
10787
 
10788
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10789
  return;
10790
  }
10791
 
 
10965
  // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
10966
  // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
10967
 
10968
+ if (params->type == GGML_TASK_TYPE_INIT) {
10969
  if (ith != 0) {
10970
  return;
10971
  }
 
10973
  return;
10974
  }
10975
 
10976
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10977
  return;
10978
  }
10979
 
 
11091
  GGML_ASSERT(ggml_is_contiguous(dst));
11092
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11093
 
11094
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11095
  return;
11096
  }
11097
 
 
11163
  size_t offset = ((int32_t *) dst->op_params)[3];
11164
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
11165
 
11166
+ if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
11167
  if (params->ith != 0) {
11168
  return;
11169
  }
 
11175
  ggml_nbytes(dst));
11176
  }
11177
 
11178
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11179
  return;
11180
  }
11181
 
 
11323
 
11324
  assert(params->ith == 0);
11325
 
11326
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11327
  return;
11328
  }
11329
 
 
11363
 
11364
  assert(params->ith == 0);
11365
 
11366
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11367
  return;
11368
  }
11369
 
 
11400
 
11401
  assert(params->ith == 0);
11402
 
11403
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11404
  return;
11405
  }
11406
 
 
11503
 
11504
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
11505
 
11506
+ if (params->type == GGML_TASK_TYPE_INIT) {
11507
  if (params->ith != 0) {
11508
  return;
11509
  }
11510
  memset(dst->data, 0, ggml_nbytes(dst));
11511
  }
11512
 
11513
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11514
  return;
11515
  }
11516
 
 
11542
 
11543
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
11544
 
11545
+ if (params->type == GGML_TASK_TYPE_INIT) {
11546
  if (params->ith != 0) {
11547
  return;
11548
  }
11549
  memset(dst->data, 0, ggml_nbytes(dst));
11550
  }
11551
 
11552
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11553
  return;
11554
  }
11555
 
 
11619
 
11620
  GGML_ASSERT(params->ith == 0);
11621
 
11622
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11623
  return;
11624
  }
11625
 
 
11688
 
11689
  GGML_ASSERT(n_past >= 0);
11690
 
11691
+ if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
11692
  if (ith != 0) {
11693
  return;
11694
  }
 
11702
  ggml_nbytes(dst));
11703
  }
11704
 
11705
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11706
  return;
11707
  }
11708
 
 
11776
  assert(ggml_is_contiguous(dst));
11777
  assert(ggml_are_same_shape(src0, dst));
11778
 
11779
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11780
  return;
11781
  }
11782
 
 
11914
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11915
  GGML_ASSERT(ggml_are_same_shape(src1, dst));
11916
 
11917
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11918
  return;
11919
  }
11920
 
 
12008
 
12009
  assert(params->ith == 0);
12010
 
12011
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12012
  return;
12013
  }
12014
 
 
12067
 
12068
  assert(params->ith == 0);
12069
 
12070
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12071
  return;
12072
  }
12073
 
 
12174
 
12175
  assert(params->ith == 0);
12176
 
12177
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12178
  return;
12179
  }
12180
 
 
12314
  const struct ggml_tensor * src0 = dst->src[0];
12315
  const struct ggml_tensor * src1 = dst->src[1];
12316
 
12317
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12318
  return;
12319
  }
12320
 
 
12492
  const struct ggml_tensor * src0 = dst->src[0];
12493
  const struct ggml_tensor * src1 = dst->src[1];
12494
 
12495
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12496
  return;
12497
  }
12498
 
 
12723
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12724
  GGML_ASSERT(nb10 == sizeof(float));
12725
 
12726
+ if (params->type == GGML_TASK_TYPE_INIT) {
12727
  if (ith != 0) {
12728
  return;
12729
  }
 
12763
  return;
12764
  }
12765
 
12766
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12767
  return;
12768
  }
12769
 
 
12822
  GGML_ASSERT(nb00 == sizeof(float));
12823
  GGML_ASSERT(nb10 == sizeof(float));
12824
 
12825
+ if (params->type == GGML_TASK_TYPE_INIT) {
12826
  if (ith != 0) {
12827
  return;
12828
  }
 
12862
  return;
12863
  }
12864
 
12865
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12866
  return;
12867
  }
12868
 
 
12966
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12967
  GGML_ASSERT(nb10 == sizeof(float));
12968
 
12969
+ if (params->type == GGML_TASK_TYPE_INIT) {
12970
  return;
12971
  }
12972
 
12973
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12974
  return;
12975
  }
12976
 
 
13054
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13055
  GGML_ASSERT(nb10 == sizeof(float));
13056
 
13057
+ if (params->type == GGML_TASK_TYPE_INIT) {
13058
  return;
13059
  }
13060
 
13061
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13062
  return;
13063
  }
13064
 
 
13140
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13141
  GGML_ASSERT(nb10 == sizeof(float));
13142
 
13143
+ if (params->type == GGML_TASK_TYPE_INIT) {
13144
  if (ith != 0) {
13145
  return;
13146
  }
 
13182
  return;
13183
  }
13184
 
13185
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13186
  return;
13187
  }
13188
 
 
13234
  assert(src->type == GGML_TYPE_F32);
13235
  assert(params->ith == 0);
13236
 
13237
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13238
  return;
13239
  }
13240
 
 
13303
  GGML_ASSERT(src->type == GGML_TYPE_F32);
13304
  GGML_ASSERT(params->ith == 0);
13305
 
13306
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13307
  return;
13308
  }
13309
 
 
13376
 
13377
  const struct ggml_tensor * src0 = dst->src[0];
13378
 
13379
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13380
  return;
13381
  }
13382
 
 
13436
 
13437
  const struct ggml_tensor * src0 = dst->src[0];
13438
 
13439
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13440
  return;
13441
  }
13442
 
 
13497
 
13498
  const struct ggml_tensor * src0 = dst->src[0];
13499
 
13500
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13501
  return;
13502
  }
13503
 
 
13523
  // C doesn't have a functional sort, so we do a bubble sort instead
13524
  for (int64_t j = 0; j < ne0; j++) {
13525
  for (int64_t k = j + 1; k < ne0; k++) {
13526
+ if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
13527
+ (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
13528
  int32_t tmp = dst_data[j];
13529
  dst_data[j] = dst_data[k];
13530
  dst_data[k] = tmp;
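The renamed sort orders are chosen by the caller of ggml_argsort; for example, ggml_top_k (earlier in this diff) requests GGML_SORT_ORDER_DESC. A short usage sketch, where scores is assumed to be an F32 tensor created elsewhere in the same context:

    // indices that would sort `scores` ascending; ggml_argsort returns an I32 tensor
    struct ggml_tensor * idx_asc  = ggml_argsort(ctx, scores, GGML_SORT_ORDER_ASC);
    // same, but largest values first
    struct ggml_tensor * idx_desc = ggml_argsort(ctx, scores, GGML_SORT_ORDER_DESC);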
 
13607
  GGML_ASSERT(nb1 <= nb2);
13608
  GGML_ASSERT(nb2 <= nb3);
13609
 
13610
+ if (params->type == GGML_TASK_TYPE_INIT) {
13611
  return;
13612
  }
13613
 
13614
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13615
  return;
13616
  }
13617
 
 
13799
  GGML_ASSERT(nb1 <= nb2);
13800
  GGML_ASSERT(nb2 <= nb3);
13801
 
13802
+ if (params->type == GGML_TASK_TYPE_INIT) {
13803
  return;
13804
  }
13805
 
13806
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13807
  return;
13808
  }
13809
 
 
14058
  GGML_ASSERT(nb1 <= nb2);
14059
  GGML_ASSERT(nb2 <= nb3);
14060
 
14061
+ if (params->type == GGML_TASK_TYPE_INIT) {
14062
  return;
14063
  }
14064
 
14065
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
14066
  return;
14067
  }
14068
 
 
14217
  GGML_ASSERT(nb1 <= nb2);
14218
  GGML_ASSERT(nb2 <= nb3);
14219
 
14220
+ if (params->type == GGML_TASK_TYPE_INIT) {
14221
  if (ith == 0) {
14222
  memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
14223
  }
14224
  return;
14225
  }
14226
 
14227
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
14228
  return;
14229
  }
14230
 
 
14540
 
14541
  const struct ggml_tensor * src0 = dst->src[0];
14542
 
14543
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14544
  return;
14545
  }
14546
 
 
14606
 
14607
  const struct ggml_tensor * src0 = dst->src[0];
14608
 
14609
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14610
  return;
14611
  }
14612
 
 
14734
 
14735
  const struct ggml_tensor * src0 = dst->src[0];
14736
 
14737
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14738
  return;
14739
  }
14740
 
 
14786
  const struct ggml_tensor * src2 = dst->src[2];
14787
 
14788
  const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
14789
+ if (!inplace && params->type == GGML_TASK_TYPE_INIT) {
14790
  if (params->ith != 0) {
14791
  return;
14792
  }
14793
  memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
14794
  return;
14795
  }
14796
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14797
  return;
14798
  }
14799
 
 
14875
 
14876
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
14877
 
14878
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14879
  return;
14880
  }
14881
 
 
14924
  assert(params->ith == 0);
14925
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
14926
 
14927
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14928
  return;
14929
  }
14930
 
 
14973
 
14974
  assert(params->ith == 0);
14975
 
14976
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14977
  return;
14978
  }
14979
 
 
14992
 
14993
  assert(params->ith == 0);
14994
 
14995
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14996
  return;
14997
  }
14998
 
 
15012
 
15013
  assert(params->ith == 0);
15014
 
15015
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15016
  return;
15017
  }
15018
 
 
15027
 
15028
  const struct ggml_tensor * a = dst->src[0];
15029
 
15030
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15031
  return;
15032
  }
15033
 
 
15045
  const struct ggml_tensor * a = dst->src[0];
15046
  const struct ggml_tensor * b = dst->src[1];
15047
 
15048
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15049
  return;
15050
  }
15051
 
 
15064
  const struct ggml_tensor * b = dst->src[1];
15065
  const struct ggml_tensor * c = dst->src[2];
15066
 
15067
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15068
  return;
15069
  }
15070
 
 
15098
 
15099
  GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
15100
 
15101
+ if (params->type == GGML_TASK_TYPE_INIT) {
15102
  if (ith == 0) {
15103
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
15104
  }
15105
  return;
15106
  }
15107
 
15108
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
15109
  if (ith == 0) {
15110
  float * dp = (float *) dst->data;
15111
  ggml_vec_sum_f32(nth, dp, sums);
 
15220
  const int64_t ith = params->ith;
15221
  const int64_t nth = params->nth;
15222
 
15223
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15224
  return;
15225
  }
15226
 
 
15327
  if (skip_cpu) {
15328
  return;
15329
  }
15330
+ GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
15331
+ GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
15332
  #elif defined(GGML_USE_VULKAN)
15333
  const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
15334
  #ifdef GGML_VULKAN_CHECK_RESULTS
 
15339
  if (skip_cpu) {
15340
  return;
15341
  }
15342
+ GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
15343
+ GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
15344
  #endif // GGML_USE_CUBLAS
15345
 
15346
  #ifdef GGML_USE_SYCL
 
16886
 
16887
  struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
16888
  const size_t obj_size = ggml_graph_nbytes(size, grads);
16889
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
16890
  struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
16891
 
16892
  struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
 
17433
  set_numa_thread_affinity(state->ith);
17434
 
17435
  int node_n = -1;
17436
+ int task_phase = GGML_TASK_TYPE_FINALIZE;
17437
 
17438
  while (true) {
17439
  if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
 
17445
  // all other threads are finished and spinning
17446
  // do finalize and init here so we don't have synchronize again
17447
  struct ggml_compute_params params = {
17448
+ /*.type =*/ GGML_TASK_TYPE_FINALIZE,
17449
  /*.ith =*/ 0,
17450
  /*.nth =*/ 0,
17451
  /*.wsize =*/ cplan->work_size,
 
17476
  if (n_tasks == 1) {
17477
  /* INIT */
17478
  if (GGML_OP_HAS_INIT[node->op]) {
17479
+ params.type = GGML_TASK_TYPE_INIT;
17480
  ggml_compute_forward(&params, node);
17481
  }
17482
 
17483
  // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
17484
  // they do something more efficient than spinning (?)
17485
+ params.type = GGML_TASK_TYPE_COMPUTE;
17486
  ggml_compute_forward(&params, node);
17487
 
17488
  if (GGML_OP_HAS_FINALIZE[node->op]) {
17489
+ params.type = GGML_TASK_TYPE_FINALIZE;
17490
  ggml_compute_forward(&params, node);
17491
  }
17492
 
 
17500
  }
17501
  }
17502
 
17503
+ task_phase = GGML_TASK_TYPE_INIT;
17504
  atomic_store(&state->shared->n_active, n_threads);
17505
  atomic_store(&state->shared->node_n, node_n);
17506
  atomic_store(&state->shared->node_task, task_phase);
 
17517
  const int n_tasks = ggml_get_n_tasks(node, n_threads);
17518
 
17519
  struct ggml_compute_params params = {
17520
+ /*.type =*/ GGML_TASK_TYPE_INIT,
17521
  /*.ith =*/ state->ith,
17522
  /*.nth =*/ n_tasks,
17523
  /*.wsize =*/ cplan->work_size,
 
17531
  }
17532
 
17533
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17534
+ task_phase = GGML_TASK_TYPE_COMPUTE;
17535
  atomic_store(&state->shared->n_active, n_threads);
17536
  atomic_store(&state->shared->node_task, task_phase);
17537
  }
 
17546
  }
17547
 
17548
  if (state->ith < n_tasks) {
17549
+ params.type = GGML_TASK_TYPE_COMPUTE;
17550
  ggml_compute_forward(&params, node);
17551
  }
17552
 
17553
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17554
+ task_phase = GGML_TASK_TYPE_FINALIZE;
17555
  atomic_store(&state->shared->n_active, n_threads);
17556
  atomic_store(&state->shared->node_task, task_phase);
17557
  }
 
17787
  /*.n_threads =*/ n_threads,
17788
  /*.n_active =*/ n_threads,
17789
  /*.node_n =*/ -1,
17790
+ /*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
17791
  /*.abort_callback =*/ NULL,
17792
  /*.abort_callback_data =*/ NULL,
17793
  };
 
17855
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
17856
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
17857
 
17858
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
17859
 
17860
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
17861
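The work buffer allocated here is an ordinary context object, now tagged GGML_OBJECT_TYPE_WORK_BUFFER, just as graphs are tagged GGML_OBJECT_TYPE_GRAPH and tensors GGML_OBJECT_TYPE_TENSOR. A small end-to-end sketch of the public call path that ends up creating all three object kinds (sizes and ops are illustrative only):

    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // each tensor is backed by a GGML_OBJECT_TYPE_TENSOR object inside ctx
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    // the graph metadata lives in a GGML_OBJECT_TYPE_GRAPH object
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // internally allocates the GGML_OBJECT_TYPE_WORK_BUFFER object shown above
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

    ggml_free(ctx);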
 
 
18663
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
18664
 
18665
  struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18666
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
18667
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18668
 
18669
  bool cancel = false;
 
18675
  if (callback) {
18676
  callback(callback_data, accum_step, &sched, &cancel);
18677
  if (cancel) {
18678
+ return GGML_OPT_RESULT_CANCEL;
18679
  }
18680
  }
18681
  // ggml_graph_reset (gf);
 
18766
  if (callback) {
18767
  callback(callback_data, accum_step, &sched, &cancel);
18768
  if (cancel) {
18769
+ return GGML_OPT_RESULT_CANCEL;
18770
  }
18771
  }
18772
  // ggml_graph_reset (gf);
 
18783
  if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
18784
  GGML_PRINT_DEBUG("converged\n");
18785
 
18786
+ return GGML_OPT_RESULT_OK;
18787
  }
18788
 
18789
  // delta-based convergence test
 
18793
  const float rate = (pf[(iter0 + t)%params.past] - fx)/fx;
18794
 
18795
  if (fabsf(rate) < params.delta) {
18796
+ return GGML_OPT_RESULT_OK;
18797
  }
18798
  }
18799
 
 
18809
  ++n_no_improvement[0];
18810
 
18811
  if (n_no_improvement[0] >= params.max_no_improvement) {
18812
+ return GGML_OPT_RESULT_OK;
18813
  }
18814
  }
18815
  }
 
18827
  }
18828
  }
18829
 
18830
+ return GGML_OPT_RESULT_DID_NOT_CONVERGE;
18831
  }
18832
 
18833
  //
 
18908
  float sched = 0;
18909
  callback(callback_data, accum_step, &sched, cancel);
18910
  if (*cancel) {
18911
+ return GGML_OPT_RESULT_CANCEL;
18912
  }
18913
  }
18914
  // ggml_graph_reset (gf);
 
18981
  if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
18982
  params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
18983
  if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
18984
+ return GGML_OPT_RESULT_INVALID_WOLFE;
18985
  }
18986
  }
18987
 
 
19010
  }
19011
 
19012
  struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
19013
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
19014
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
19015
 
19016
  float * x = opt->lbfgs.x->data; // current parameters
 
19051
  float sched = 0;
19052
  callback(callback_data, accum_step, &sched, &cancel);
19053
  if (cancel) {
19054
+ return GGML_OPT_RESULT_CANCEL;
19055
  }
19056
  }
19057
  // ggml_graph_reset (gf);
 
19079
 
19080
  // already optimized
19081
  if (gnorm/xnorm <= params.lbfgs.eps) {
19082
+ return GGML_OPT_RESULT_OK;
19083
  }
19084
 
19085
  if (opt->just_initialized) {
 
19124
  // way to test and don't want to break something with so many changes lined up
19125
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
19126
  if (cancel) {
19127
+ return GGML_OPT_RESULT_CANCEL;
19128
  }
19129
 
19130
  if (ls < 0) {
 
19147
  }
19148
  if (gnorm/xnorm <= params.lbfgs.eps) {
19149
  // converged
19150
+ return GGML_OPT_RESULT_OK;
19151
  }
19152
 
19153
  // delta-based convergence test
 
19157
  const float rate = (pf[k[0]%params.past] - fx)/fx;
19158
 
19159
  if (fabsf(rate) < params.delta) {
19160
+ return GGML_OPT_RESULT_OK;
19161
  }
19162
  }
19163
 
 
19173
  n_no_improvement[0]++;
19174
 
19175
  if (n_no_improvement[0] >= params.max_no_improvement) {
19176
+ return GGML_OPT_RESULT_OK;
19177
  }
19178
  }
19179
  }
19180
 
19181
  if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) {
19182
  // reached the maximum number of iterations
19183
+ return GGML_OPT_RESULT_DID_NOT_CONVERGE;
19184
  }
19185
 
19186
  // update vectors s and y:
 
19236
 
19237
  GGML_ASSERT(false && "lbfgs failed");
19238
 
19239
+ return GGML_OPT_RESULT_DID_NOT_CONVERGE;
19240
  }
19241
 
19242
  struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
19243
  struct ggml_opt_params result;
19244
 
19245
  switch (type) {
19246
+ case GGML_OPT_TYPE_ADAM:
19247
  {
19248
  result = (struct ggml_opt_params) {
19249
+ .type = GGML_OPT_TYPE_ADAM,
19250
  .graph_size = GGML_DEFAULT_GRAPH_SIZE,
19251
  .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
19252
  .past = 0,
 
19274
  },
19275
  };
19276
  } break;
19277
+ case GGML_OPT_TYPE_LBFGS:
19278
  {
19279
  result = (struct ggml_opt_params) {
19280
+ .type = GGML_OPT_TYPE_LBFGS,
19281
  .graph_size = GGML_DEFAULT_GRAPH_SIZE,
19282
  .n_threads = 1,
19283
  .past = 0,
 
19322
  opt->just_initialized = true;
19323
  if (opt->ctx == NULL) {
19324
  struct ggml_init_params ctx_opt_params;
19325
+ if (opt->params.type == GGML_OPT_TYPE_ADAM) {
19326
  ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3;
19327
  if (opt->params.past > 0) {
19328
  ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
19329
  }
19330
+ } else if (opt->params.type == GGML_OPT_TYPE_LBFGS) {
19331
  ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2);
19332
  if (opt->params.past > 0) {
19333
  ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
 
19339
  opt->ctx = ggml_init(ctx_opt_params);
19340
  }
19341
  switch (opt->params.type) {
19342
+ case GGML_OPT_TYPE_ADAM:
19343
  {
19344
  opt->adam.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
19345
  opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
 
19353
  ggml_set_zero(opt->adam.pf);
19354
  }
19355
  } break;
19356
+ case GGML_OPT_TYPE_LBFGS:
19357
  {
19358
  opt->lbfgs.x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
19359
  opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
 
19397
 
19398
  ctx = ggml_init(params_ctx);
19399
  if (ctx == NULL) {
19400
+ return GGML_OPT_RESULT_NO_CONTEXT;
19401
  }
19402
 
19403
  free_ctx = true;
19404
  }
19405
 
19406
+ enum ggml_opt_result result = GGML_OPT_RESULT_OK;
19407
 
19408
  struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context));
19409
 
 
19442
  void * callback_data) {
19443
 
19444
  // build forward + backward compute graphs
19445
+ enum ggml_opt_result result = GGML_OPT_RESULT_OK;
19446
 
19447
  switch (opt->params.type) {
19448
+ case GGML_OPT_TYPE_ADAM:
19449
  {
19450
  result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19451
  } break;
19452
+ case GGML_OPT_TYPE_LBFGS:
19453
  {
19454
  result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19455
  } break;
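For callers of the optimizer only the enum spellings change; the entry points keep their signatures. A minimal sketch, assuming a scalar loss tensor f already built in ctx and the existing ggml_opt(ctx, params, f) entry point:

    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
    opt_params.n_threads = 4; // illustrative

    enum ggml_opt_result res = ggml_opt(ctx, opt_params, f);
    if (res != GGML_OPT_RESULT_OK && res != GGML_OPT_RESULT_DID_NOT_CONVERGE) {
        // e.g. GGML_OPT_RESULT_NO_CONTEXT, GGML_OPT_RESULT_INVALID_WOLFE,
        //      GGML_OPT_RESULT_FAIL or GGML_OPT_RESULT_CANCEL
    }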
ggml.h CHANGED
@@ -364,9 +364,9 @@ extern "C" {
364
  };
365
 
366
  enum ggml_backend_type {
367
- GGML_BACKEND_CPU = 0,
368
- GGML_BACKEND_GPU = 10,
369
- GGML_BACKEND_GPU_SPLIT = 20,
370
  };
371
 
372
  // model file types
@@ -498,9 +498,9 @@ extern "C" {
498
  };
499
 
500
  enum ggml_object_type {
501
- GGML_OBJECT_TENSOR,
502
- GGML_OBJECT_GRAPH,
503
- GGML_OBJECT_WORK_BUFFER
504
  };
505
 
506
  enum ggml_log_level {
@@ -642,9 +642,9 @@ extern "C" {
642
  // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
643
  // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
644
  enum ggml_task_type {
645
- GGML_TASK_INIT = 0,
646
- GGML_TASK_COMPUTE,
647
- GGML_TASK_FINALIZE,
648
  };
649
 
650
  struct ggml_compute_params {
@@ -1659,8 +1659,8 @@ extern "C" {
1659
 
1660
  // sort rows
1661
  enum ggml_sort_order {
1662
- GGML_SORT_ASC,
1663
- GGML_SORT_DESC,
1664
  };
1665
 
1666
  GGML_API struct ggml_tensor * ggml_argsort(
@@ -1953,8 +1953,8 @@ extern "C" {
1953
 
1954
  // optimization methods
1955
  enum ggml_opt_type {
1956
- GGML_OPT_ADAM,
1957
- GGML_OPT_LBFGS,
1958
  };
1959
 
1960
  // linesearch methods
@@ -1968,12 +1968,12 @@ extern "C" {
1968
 
1969
  // optimization return values
1970
  enum ggml_opt_result {
1971
- GGML_OPT_OK = 0,
1972
- GGML_OPT_DID_NOT_CONVERGE,
1973
- GGML_OPT_NO_CONTEXT,
1974
- GGML_OPT_INVALID_WOLFE,
1975
- GGML_OPT_FAIL,
1976
- GGML_OPT_CANCEL,
1977
 
1978
  GGML_LINESEARCH_FAIL = -128,
1979
  GGML_LINESEARCH_MINIMUM_STEP,
 
364
  };
365
 
366
  enum ggml_backend_type {
367
+ GGML_BACKEND_TYPE_CPU = 0,
368
+ GGML_BACKEND_TYPE_GPU = 10,
369
+ GGML_BACKEND_TYPE_GPU_SPLIT = 20,
370
  };
371
 
372
  // model file types
 
498
  };
499
 
500
  enum ggml_object_type {
501
+ GGML_OBJECT_TYPE_TENSOR,
502
+ GGML_OBJECT_TYPE_GRAPH,
503
+ GGML_OBJECT_TYPE_WORK_BUFFER
504
  };
505
 
506
  enum ggml_log_level {
 
642
  // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
643
  // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
644
  enum ggml_task_type {
645
+ GGML_TASK_TYPE_INIT = 0,
646
+ GGML_TASK_TYPE_COMPUTE,
647
+ GGML_TASK_TYPE_FINALIZE,
648
  };
649
 
650
  struct ggml_compute_params {
 
1659
 
1660
  // sort rows
1661
  enum ggml_sort_order {
1662
+ GGML_SORT_ORDER_ASC,
1663
+ GGML_SORT_ORDER_DESC,
1664
  };
1665
 
1666
  GGML_API struct ggml_tensor * ggml_argsort(
 
1953
 
1954
  // optimization methods
1955
  enum ggml_opt_type {
1956
+ GGML_OPT_TYPE_ADAM,
1957
+ GGML_OPT_TYPE_LBFGS,
1958
  };
1959
 
1960
  // linesearch methods
 
1968
 
1969
  // optimization return values
1970
  enum ggml_opt_result {
1971
+ GGML_OPT_RESULT_OK = 0,
1972
+ GGML_OPT_RESULT_DID_NOT_CONVERGE,
1973
+ GGML_OPT_RESULT_NO_CONTEXT,
1974
+ GGML_OPT_RESULT_INVALID_WOLFE,
1975
+ GGML_OPT_RESULT_FAIL,
1976
+ GGML_OPT_RESULT_CANCEL,
1977
 
1978
  GGML_LINESEARCH_FAIL = -128,
1979
  GGML_LINESEARCH_MINIMUM_STEP,
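The rename is mechanical and one-to-one, so downstream code that has not migrated yet could, in principle, bridge the gap with a small shim. The mapping below is purely illustrative and not part of this commit; the in-tree call sites are all updated directly, so ggml itself ships no such header:

    // old name                       new name (this commit)
    #define GGML_BACKEND_CPU          GGML_BACKEND_TYPE_CPU
    #define GGML_BACKEND_GPU          GGML_BACKEND_TYPE_GPU
    #define GGML_BACKEND_GPU_SPLIT    GGML_BACKEND_TYPE_GPU_SPLIT
    #define GGML_OBJECT_TENSOR        GGML_OBJECT_TYPE_TENSOR
    #define GGML_OBJECT_GRAPH         GGML_OBJECT_TYPE_GRAPH
    #define GGML_OBJECT_WORK_BUFFER   GGML_OBJECT_TYPE_WORK_BUFFER
    #define GGML_TASK_INIT            GGML_TASK_TYPE_INIT
    #define GGML_TASK_COMPUTE         GGML_TASK_TYPE_COMPUTE
    #define GGML_TASK_FINALIZE        GGML_TASK_TYPE_FINALIZE
    #define GGML_SORT_ASC             GGML_SORT_ORDER_ASC
    #define GGML_SORT_DESC            GGML_SORT_ORDER_DESC
    #define GGML_OPT_ADAM             GGML_OPT_TYPE_ADAM
    #define GGML_OPT_LBFGS            GGML_OPT_TYPE_LBFGS
    #define GGML_OPT_OK               GGML_OPT_RESULT_OK
    #define GGML_OPT_DID_NOT_CONVERGE GGML_OPT_RESULT_DID_NOT_CONVERGE
    #define GGML_OPT_NO_CONTEXT       GGML_OPT_RESULT_NO_CONTEXT
    #define GGML_OPT_INVALID_WOLFE    GGML_OPT_RESULT_INVALID_WOLFE
    #define GGML_OPT_FAIL             GGML_OPT_RESULT_FAIL
    #define GGML_OPT_CANCEL           GGML_OPT_RESULT_CANCEL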