JohannesGaessler committed
Commit 26820f6 · 1 Parent(s): 418769d

CUDA: fix crash with partial offloading of MoE (llama/13439)

ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -1909,13 +1909,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
 
+    // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
+    // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
+    // Therefore, in such cases use cuBLAS.
+    const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
+        && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
+
     bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-    bool use_mul_mat_q = ggml_is_quantized(src0->type)
+    bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 
     bool any_gpus_with_slow_fp16 = false;
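
The new comment block above boils down to a small predicate: clearing src0's padding is only unsafe when src0 sits in a temporary compute buffer, actually has padding, and is a view of another tensor. The standalone sketch below models that decision with simplified stand-in types; the names (toy_tensor, buffer_usage, bad_padding_clear_toy) are illustrative only and are not part of the ggml API.

// Standalone illustration (not ggml code): when is clearing src0's padding unsafe?
#include <cstddef>
#include <cstdio>

// Simplified stand-ins for the relevant ggml concepts.
enum class buffer_usage { weights, compute };

struct toy_tensor {
    buffer_usage usage;      // how the backing buffer is used
    size_t       size_data;  // bytes of real tensor data (stand-in for ggml_nbytes)
    size_t       size_alloc; // bytes reserved in the buffer, possibly with padding
    bool         is_view;    // stand-in for src0->view_src != nullptr
};

// Mirrors the bad_padding_clear predicate: padding exists and would have to be
// cleared, but the memset could stomp on data owned by the viewed tensor.
static bool bad_padding_clear_toy(const toy_tensor & t) {
    return t.usage == buffer_usage::compute && t.size_alloc != t.size_data && t.is_view;
}

int main() {
    const toy_tensor weight   = {buffer_usage::weights, 1024, 1024, false}; // no padding, no problem
    const toy_tensor moe_view = {buffer_usage::compute, 1000, 1024, true};  // padded view -> fall back to cuBLAS
    std::printf("weight:   MMQ/MMVQ allowed? %s\n", bad_padding_clear_toy(weight)   ? "no" : "yes");
    std::printf("moe_view: MMQ/MMVQ allowed? %s\n", bad_padding_clear_toy(moe_view) ? "no" : "yes");
    return 0;
}

With this predicate folded into use_mul_mat_vec_q and use_mul_mat_q, the padded-view case (as hit by partially offloaded MoE weights) routes to cuBLAS instead of a path that would need to memset the padding.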
ggml/src/ggml-cuda/mmq.cu CHANGED
@@ -91,11 +91,11 @@ void ggml_cuda_mul_mat_q(
 
     // If src0 is a temporary compute buffer, clear any potential padding.
     if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-        GGML_ASSERT(!src0->view_src);
         const size_t size_data  = ggml_nbytes(src0);
         const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
         if (size_alloc > size_data) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
             CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
         }
     }
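
The padding clear in the hunk above is an asynchronous memset of just the tail of the allocation, issued on the same stream as the subsequent kernel, and the moved assertions now only fire when there is actually padding to clear. The minimal standalone sketch below reproduces that pattern against a plain CUDA allocation; the sizes and the check_cuda helper are made up for illustration and are not ggml's CUDA_CHECK or its buffer layout.

// Standalone illustration: zero only the padding tail of a device allocation.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Hypothetical error helper, playing the role of CUDA_CHECK in the hunk above.
static void check_cuda(cudaError_t err, const char * what) {
    if (err != cudaSuccess) {
        std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        std::exit(1);
    }
}

int main() {
    const size_t size_data  = 1000; // bytes of valid data (stand-in for ggml_nbytes(src0))
    const size_t size_alloc = 1024; // bytes actually allocated, including padding

    cudaStream_t stream;
    check_cuda(cudaStreamCreate(&stream), "cudaStreamCreate");

    char * data = nullptr;
    check_cuda(cudaMalloc(&data, size_alloc), "cudaMalloc");

    // ... size_data bytes of real tensor data would be written here ...

    // Zero only the padding so kernels that read whole blocks past the end of
    // the data see zeros instead of garbage. This is only safe when the padding
    // is not aliased by another tensor (hence the view_src assertion above).
    if (size_alloc > size_data) {
        check_cuda(cudaMemsetAsync(data + size_data, 0, size_alloc - size_data, stream),
                   "cudaMemsetAsync");
    }

    check_cuda(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
    check_cuda(cudaFree(data), "cudaFree");
    check_cuda(cudaStreamDestroy(stream), "cudaStreamDestroy");
    return 0;
}

The same change is applied to ggml_cuda_mul_mat_vec_q in mmvq.cu below.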
ggml/src/ggml-cuda/mmvq.cu CHANGED
@@ -515,11 +515,11 @@ void ggml_cuda_mul_mat_vec_q(
 
     // If src0 is a temporary compute buffer, clear any potential padding.
     if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-        GGML_ASSERT(!src0->view_src);
         const size_t size_data  = ggml_nbytes(src0);
         const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
         if (size_alloc > size_data) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
             CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
         }
     }