JohannesGaessler committed
Commit c3e51a2 · Parent: 3628417

CUDA: fix logic for clearing padding with -ngl 0 (llama/13320)

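With -ngl 0 the quantized weights are not resident in a dedicated CUDA weight buffer; they pass through temporary compute buffers, and ggml_backend_cuda_buffer_init_tensor deliberately skips zeroing the row padding for buffers whose usage is GGML_BACKEND_BUFFER_USAGE_COMPUTE (see the ggml-cuda.cu hunk below). The MMQ and MMVQ entry points therefore clear any padding themselves right before use. A minimal sketch of that pattern, assuming the hypothetical helper name clear_src0_padding (the commit inlines the same logic directly in ggml_cuda_mul_mat_q and ggml_cuda_mul_mat_vec_q):

// Sketch only, not part of the commit: clear_src0_padding is a hypothetical helper name.
#include <cuda_runtime.h>
#include "ggml.h"
#include "ggml-backend.h"

static void clear_src0_padding(const ggml_tensor * src0, cudaStream_t stream) {
    // Weights staged through a temporary compute buffer (the -ngl 0 path) are not
    // zero-padded by init_tensor, so clear any trailing padding before MMQ/MMVQ.
    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
        GGML_ASSERT(ggml_is_contiguous(src0));
        const size_t size_data  = ggml_nbytes(src0);
        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
        if (size_alloc > size_data) {
            const cudaError_t err = cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream);
            GGML_ASSERT(err == cudaSuccess);
        }
    }
}

The contiguity assert mirrors the new split-buffer asserts: the size_data/size_alloc arithmetic only identifies the padding as the tail of the allocation when the tensor data is laid out contiguously.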
ggml/include/ggml-backend.h CHANGED
@@ -38,7 +38,7 @@ extern "C" {
     GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
     GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
-    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
     GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
     GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);

@@ -59,7 +59,7 @@ extern "C" {
     GGML_API enum ggml_status      ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API size_t                ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API size_t                ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
-    GGML_API size_t                ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t                ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
     GGML_API void                  ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
     GGML_API bool                  ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
     GGML_API void                  ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
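The only functional change in this header is the const-qualification of the tensor argument on the two get_alloc_size queries, which lets code holding only a const ggml_tensor * (such as src0 in the CUDA mat-mul paths further down) query the padded allocation size without a cast. A minimal usage sketch, not part of the commit; the helper name tensor_padding_bytes is hypothetical:

#include "ggml-backend.h"

// How many bytes of padding does the buffer reserve beyond the tensor's own data?
static size_t tensor_padding_bytes(const struct ggml_tensor * t) {
    const size_t data_size  = ggml_nbytes(t);                                   // bytes of tensor data
    const size_t alloc_size = ggml_backend_buffer_get_alloc_size(t->buffer, t); // may include row padding
    return alloc_size > data_size ? alloc_size - data_size : 0;
}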
ggml/src/ggml-backend.cpp CHANGED
@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
     return SIZE_MAX;
 }

-size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
         size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -152,7 +152,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
 }

-size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }

ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -555,8 +555,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer

     if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
         // initialize padding to 0 to avoid possible NaN values
-        size_t original_size = ggml_nbytes(tensor);
-        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+        const size_t original_size = ggml_nbytes(tensor);
+        const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

         if (padded_size > original_size) {
             ggml_cuda_set_device(ctx->device);
@@ -679,6 +679,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t

     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
+            GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
             size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
     }
@@ -800,6 +801,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff

 static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
@@ -851,6 +853,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

@@ -889,6 +892,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

@@ -970,6 +974,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf

 static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     size_t total_size = 0;

@@ -2065,6 +2070,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
     src0_slice.ne[2] = 1;
     src0_slice.nb[3] = src0_slice.nb[2];
     src0_slice.data = (char *) src0->data + i02*nb02;
+    GGML_ASSERT(!ggml_cuda_should_use_mmq(src0->type, cc, ne11) || ne00 % MATRIX_ROW_PADDING == 0);

     ggml_tensor src1_slice;
     memset(&src1_slice, 0, sizeof(src1_slice));
ggml/src/ggml-cuda/mmq.cu CHANGED
@@ -89,6 +89,16 @@ void ggml_cuda_mul_mat_q(
     const float * src1_d = (const float *) src1->data;
     float * dst_d = (float *) dst->data;

+    // If src0 is a temporary compute buffer, clear any potential padding.
+    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+        const size_t size_data  = ggml_nbytes(src0);
+        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
+        if (size_alloc > size_data) {
+            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
+        }
+    }
+
     const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);

     const int64_t s01 = src0->nb[1] / ts_src0;
ggml/src/ggml-cuda/mmvq.cu CHANGED
@@ -513,6 +513,16 @@ void ggml_cuda_mul_mat_vec_q(
     const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
     float * dst_d = (float *) dst->data;

+    // If src0 is a temporary compute buffer, clear any potential padding.
+    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+        const size_t size_data  = ggml_nbytes(src0);
+        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
+        if (size_alloc > size_data) {
+            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
+        }
+    }
+
     const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
     ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1);
     {
ggml/src/ggml-cuda/quantize.cu CHANGED
@@ -163,6 +163,7 @@ void quantize_mmq_q8_1_cuda(
     const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
     const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
     const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
+    GGML_ASSERT(ne00 % 4 == 0);
     GGML_ASSERT(ne0 % (4*QK8_1) == 0);

     const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);