Commit 26820f6 · Parent(s): 418769d
CUDA: fix crash with partial offloading of MoE (llama/13439)
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
@@ -1909,13 +1909,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
 
+    // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
+    // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
+    // Therefore, in such cases use cuBLAS.
+    const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
+        && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
+
     bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-    bool use_mul_mat_q = ggml_is_quantized(src0->type)
+    bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 
     bool any_gpus_with_slow_fp16 = false;
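In effect, the dispatcher now refuses to pick the quantized MMQ/MMVQ paths whenever clearing the padding of src0 would be unsafe (a padded compute buffer that is also a view of another tensor) and lets those cases fall through to cuBLAS. Below is a minimal host-side C++ sketch of that gate; toy_tensor, buffer_usage, bad_padding_clear and pick_backend are hypothetical stand-ins for the real ggml types and calls shown in the diff, not the ggml API itself.

// Minimal host-side sketch of the new dispatch gate (hypothetical types, not the real ggml API).
#include <cstddef>
#include <cstdio>

enum class buffer_usage { weights, compute };

struct toy_tensor {
    bool         quantized;  // stands in for ggml_is_quantized(src0->type)
    bool         is_view;    // stands in for src0->view_src != nullptr
    size_t       nbytes;     // stands in for ggml_nbytes(src0)
    size_t       alloc_size; // stands in for ggml_backend_buffer_get_alloc_size(src0->buffer, src0)
    buffer_usage usage;      // stands in for ggml_backend_buffer_get_usage(src0->buffer)
};

// True when clearing the padding of src0 could overwrite data of the tensor it
// is a view of -- the case the commit routes to cuBLAS instead.
static bool bad_padding_clear(const toy_tensor & src0) {
    return src0.usage == buffer_usage::compute
        && src0.nbytes != src0.alloc_size
        && src0.is_view;
}

static const char * pick_backend(const toy_tensor & src0) {
    if (src0.quantized && !bad_padding_clear(src0)) {
        return "mul_mat_q / mul_mat_vec_q"; // padding (if any) can be cleared safely
    }
    return "cuBLAS";                        // safe fallback, no padding clear needed
}

int main() {
    toy_tensor plain  = {true, false, 4096, 4352, buffer_usage::compute};
    toy_tensor viewed = {true, true,  4096, 4352, buffer_usage::compute};
    std::printf("plain  -> %s\n", pick_backend(plain));  // quantized kernels
    std::printf("viewed -> %s\n", pick_backend(viewed)); // cuBLAS fallback
    return 0;
}

Compiled as ordinary C++, the padded view is routed to cuBLAS while the plain quantized tensor keeps using the quantized kernels, mirroring the intent of the new check.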
ggml/src/ggml-cuda/mmq.cu
CHANGED
@@ -91,11 +91,11 @@ void ggml_cuda_mul_mat_q(
 
     // If src0 is a temporary compute buffer, clear any potential padding.
     if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-        GGML_ASSERT(!src0->view_src);
         const size_t size_data  = ggml_nbytes(src0);
         const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
         if (size_alloc > size_data) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
             CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
         }
     }
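The padding clear itself is unchanged here; the two assertions simply move inside the size_alloc > size_data branch, so they are only checked when there actually is padding to clear. A rough host-side analogue, with a hypothetical toy_src0 struct and std::memset standing in for the cudaMemsetAsync call on the device buffer:

// Host-side analogue of the padding clear in ggml_cuda_mul_mat_q (hypothetical
// helper and type names; std::memset stands in for cudaMemsetAsync).
#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>

struct toy_src0 {
    std::vector<char> storage;    // backing allocation (size_alloc bytes)
    size_t            size_data;  // bytes actually holding tensor data (ggml_nbytes)
    bool              is_view;    // stands in for src0->view_src != nullptr
    bool              contiguous; // stands in for ggml_is_contiguously_allocated(src0)
};

static void clear_padding(toy_src0 & src0) {
    const size_t size_data  = src0.size_data;
    const size_t size_alloc = src0.storage.size();
    if (size_alloc > size_data) {
        // The preconditions are only needed when padding is actually cleared,
        // which is why the commit moves the asserts inside this branch.
        assert(src0.contiguous);
        assert(!src0.is_view);
        std::memset(src0.storage.data() + size_data, 0, size_alloc - size_data);
    }
}

int main() {
    toy_src0 t{std::vector<char>(4352, 0x7f), 4096, /*is_view=*/false, /*contiguous=*/true};
    clear_padding(t);
    std::printf("padding byte after clear: %d\n", (int) t.storage[4200]); // prints 0
    return 0;
}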
ggml/src/ggml-cuda/mmvq.cu
CHANGED
@@ -515,11 +515,11 @@ void ggml_cuda_mul_mat_vec_q(
 
     // If src0 is a temporary compute buffer, clear any potential padding.
     if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-        GGML_ASSERT(!src0->view_src);
         const size_t size_data  = ggml_nbytes(src0);
         const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
         if (size_alloc > size_data) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
             CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
         }
     }
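The mmvq.cu hunk is the same reordering as in mmq.cu. Combined with the bad_padding_clear gate in ggml-cuda.cu above, a quantized src0 that is both padded and a view of another tensor is no longer dispatched to these kernels at all, so the GGML_ASSERT(!src0->view_src) that previously aborted with partially offloaded MoE models (per the commit title) can no longer be hit on this path.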