Commit c3e51a2 · Parent: 3628417
CUDA: fix logic for clearing padding with -ngl 0 (llama/13320)
Files changed:
- ggml/include/ggml-backend.h      +2 -2
- ggml/src/ggml-backend.cpp        +2 -2
- ggml/src/ggml-cuda/ggml-cuda.cu  +8 -2
- ggml/src/ggml-cuda/mmq.cu       +10 -0
- ggml/src/ggml-cuda/mmvq.cu      +10 -0
- ggml/src/ggml-cuda/quantize.cu   +1 -0
ggml/include/ggml-backend.h CHANGED

@@ -38,7 +38,7 @@ extern "C" {
    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
    GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
    GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
-   GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+   GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
    GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
    GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);

@@ -59,7 +59,7 @@ extern "C" {
    GGML_API enum ggml_status ggml_backend_buffer_init_tensor    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API size_t           ggml_backend_buffer_get_alignment  (ggml_backend_buffer_t buffer);
    GGML_API size_t           ggml_backend_buffer_get_max_size   (ggml_backend_buffer_t buffer);
-   GGML_API size_t           ggml_backend_buffer_get_alloc_size (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+   GGML_API size_t           ggml_backend_buffer_get_alloc_size (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
    GGML_API void             ggml_backend_buffer_clear          (ggml_backend_buffer_t buffer, uint8_t value);
    GGML_API bool             ggml_backend_buffer_is_host        (ggml_backend_buffer_t buffer);
    GGML_API void             ggml_backend_buffer_set_usage      (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
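The only change in this header is the const qualifier on the tensor argument of the two alloc-size getters. A minimal caller sketch, not part of the patch, showing how the const-qualified API can be used to locate the padding region that follows a tensor's data; the helper name tensor_padding_bytes is illustrative only:

// Sketch only: with the const-qualified signature, a read-only tensor pointer
// can be passed straight through. tensor_padding_bytes() is hypothetical.
static size_t tensor_padding_bytes(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
    const size_t size_data  = ggml_nbytes(tensor);                                // bytes used by the tensor data
    const size_t size_alloc = ggml_backend_buffer_get_alloc_size(buffer, tensor); // bytes reserved, incl. padding
    return size_alloc > size_data ? size_alloc - size_data : 0;
}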
ggml/src/ggml-backend.cpp CHANGED

@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
    return SIZE_MAX;
}

-size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
    // get_alloc_size is optional, defaults to ggml_nbytes
    if (buft->iface.get_alloc_size) {
        size_t size = buft->iface.get_alloc_size(buft, tensor);

@@ -152,7 +152,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
    return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
}

-size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
}

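The first hunk above is cut off mid-function by the diff context. A rough sketch of how the full dispatcher presumably reads, filled in only from the comment in the hunk ("get_alloc_size is optional, defaults to ggml_nbytes"); the body after the truncation point is an assumption, not part of the patch:

size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
    // get_alloc_size is optional, defaults to ggml_nbytes
    if (buft->iface.get_alloc_size) {
        size_t size = buft->iface.get_alloc_size(buft, tensor);
        assert(size >= ggml_nbytes(tensor)); // assumed sanity check, not visible in the hunk
        return size;
    }
    return ggml_nbytes(tensor); // assumed fallback, per the comment above
}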
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED

@@ -555,8 +555,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer

    if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
        // initialize padding to 0 to avoid possible NaN values
-       size_t original_size = ggml_nbytes(tensor);
-       size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+       const size_t original_size = ggml_nbytes(tensor);
+       const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

        if (padded_size > original_size) {
            ggml_cuda_set_device(ctx->device);

@@ -679,6 +679,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t

    if (ggml_is_quantized(tensor->type)) {
        if (ne0 % MATRIX_ROW_PADDING != 0) {
+           GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }
    }

@@ -800,6 +801,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff

static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+   GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

@@ -851,6 +853,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
    // split tensors must always be set in their entirety at once
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));
+   GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

@@ -889,6 +892,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
    // split tensors must always be set in their entirety at once
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));
+   GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

@@ -970,6 +974,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf

static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
    ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
+   GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

    size_t total_size = 0;

@@ -2065,6 +2070,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
        src0_slice.ne[2] = 1;
        src0_slice.nb[3] = src0_slice.nb[2];
        src0_slice.data  = (char *) src0->data + i02*nb02;
+       GGML_ASSERT(!ggml_cuda_should_use_mmq(src0->type, cc, ne11) || ne00 % MATRIX_ROW_PADDING == 0);

        ggml_tensor src1_slice;
        memset(&src1_slice, 0, sizeof(src1_slice));
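For reference, the padded-allocation logic that the new asserts protect can be read as the following standalone sketch. It mirrors the ggml_backend_cuda_buffer_type_get_alloc_size hunk above; the helper name cuda_padded_alloc_size is illustrative only.

// Sketch, mirroring the get_alloc_size hunk above: for quantized matrices, extra
// bytes are reserved after the data so the last row can be read up to the next
// multiple of MATRIX_ROW_PADDING elements.
static size_t cuda_padded_alloc_size(const struct ggml_tensor * tensor) {
    size_t size = ggml_nbytes(tensor);
    const int64_t ne0 = tensor->ne[0];

    if (ggml_is_quantized(tensor->type) && ne0 % MATRIX_ROW_PADDING != 0) {
        // only well-defined for tensors contiguous in dim 0, which is what the new
        // GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor)) enforces
        size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
    }
    return size;
}

The init_tensor hunk zeroes exactly the region between ggml_nbytes and this padded size, but only for non-compute buffers; tensors living in temporary compute buffers (which is what quantized weights end up in with -ngl 0) are skipped there, which is why mmq.cu and mmvq.cu below clear the padding themselves.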
ggml/src/ggml-cuda/mmq.cu CHANGED

@@ -89,6 +89,16 @@ void ggml_cuda_mul_mat_q(
    const float * src1_d = (const float *) src1->data;
    float * dst_d = (float *) dst->data;

+   // If src0 is a temporary compute buffer, clear any potential padding.
+   if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
+       GGML_ASSERT(ggml_is_contiguous(src0));
+       const size_t size_data  = ggml_nbytes(src0);
+       const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
+       if (size_alloc > size_data) {
+           CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
+       }
+   }
+
    const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);

    const int64_t s01 = src0->nb[1] / ts_src0;
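The same ten lines are added to mmvq.cu below. Purely as an illustration of the shared pattern (not part of the patch), the logic could be read as a helper like this hypothetical clear_src0_padding():

// Illustrative refactor only: the pattern that ggml_cuda_mul_mat_q and
// ggml_cuda_mul_mat_vec_q now both inline. clear_src0_padding() is hypothetical.
static void clear_src0_padding(const ggml_tensor * src0, cudaStream_t stream) {
    // Weight buffers have their padding zeroed once in init_tensor; temporary
    // compute buffers do not, so clear it here before the quantized matmul.
    if (ggml_backend_buffer_get_usage(src0->buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
        return;
    }
    GGML_ASSERT(ggml_is_contiguous(src0));
    const size_t size_data  = ggml_nbytes(src0);
    const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
    if (size_alloc > size_data) {
        // async memset on the compute stream, so it is ordered before the kernels launched next
        CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
    }
}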
ggml/src/ggml-cuda/mmvq.cu CHANGED

@@ -513,6 +513,16 @@ void ggml_cuda_mul_mat_vec_q(
    const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
    float * dst_d = (float *) dst->data;

+   // If src0 is a temporary compute buffer, clear any potential padding.
+   if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
+       GGML_ASSERT(ggml_is_contiguous(src0));
+       const size_t size_data  = ggml_nbytes(src0);
+       const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
+       if (size_alloc > size_data) {
+           CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
+       }
+   }
+
    const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
    ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1);
    {
ggml/src/ggml-cuda/quantize.cu CHANGED

@@ -163,6 +163,7 @@ void quantize_mmq_q8_1_cuda(
    const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
    const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
+   GGML_ASSERT(ne00 % 4 == 0);
    GGML_ASSERT(ne0 % (4*QK8_1) == 0);

    const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);