Diego Devesa committed on
Commit
6b6155b
·
1 Parent(s): e4b1812

cuda : prevent using split buffers with 3d/4d matrices (llama/13919)

Browse files
Files changed (1) hide show
  1. ggml/src/ggml-cuda/ggml-cuda.cu +5 -2
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -2994,9 +2994,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
2994
  {
2995
  struct ggml_tensor * a = op->src[0];
2996
  struct ggml_tensor * b = op->src[1];
2997
- // for small weight matrices the active device can end up without any rows, don't use row split in those cases
2998
- // this avoids some edge cases (and the performance would not be good anyways)
2999
  if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
 
 
 
 
 
3000
  ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
3001
  int64_t row_low;
3002
  int64_t row_high;
 
2994
  {
2995
  struct ggml_tensor * a = op->src[0];
2996
  struct ggml_tensor * b = op->src[1];
 
 
2997
  if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
2998
+ if (a->ne[2] > 1 || a->ne[3] > 1) {
2999
+ return false;
3000
+ }
3001
+ // for small weight matrices the active device can end up without any rows, don't use row split in those cases
3002
+ // this avoids some edge cases (and the performance would not be good anyways)
3003
  ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
3004
  int64_t row_low;
3005
  int64_t row_high;