slaren committed
cuda : fix tensor size calculation for non-split buffer (llama/5145)
- ggml-backend.c +3 -1
- ggml-cuda.cu +5 -14
ggml-backend.c
CHANGED

@@ -30,7 +30,9 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
 GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
-        return buft->iface.get_alloc_size(buft, tensor);
+        size_t size = buft->iface.get_alloc_size(buft, tensor);
+        assert(size >= ggml_nbytes(tensor));
+        return size;
     }
     return ggml_nbytes(tensor);
 }
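The added assert spells out the contract between ggml-backend.c and backend implementations: a custom get_alloc_size may pad the allocation, but it must never report fewer bytes than ggml_nbytes. The following is a minimal standalone sketch of that contract only; ROW_PADDING, pad_to and example_get_alloc_size are made-up names for illustration, not ggml helpers.

#include <assert.h>
#include <stdio.h>

#define ROW_PADDING 512  /* hypothetical padding granularity, not ggml's value */

static size_t pad_to(size_t size, size_t multiple) {
    return (size + multiple - 1) / multiple * multiple;
}

/* A well-behaved get_alloc_size may return more than the raw tensor size
   (e.g. padding for quantized kernels), but never less. */
static size_t example_get_alloc_size(size_t nbytes) {
    size_t alloc_size = pad_to(nbytes, ROW_PADDING);
    assert(alloc_size >= nbytes);  /* the invariant ggml-backend.c now asserts */
    return alloc_size;
}

int main(void) {
    printf("6000 bytes -> %zu bytes allocated\n", example_get_alloc_size(6000));
    return 0;
}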
ggml-cuda.cu
CHANGED

@@ -9790,8 +9790,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     // TODO: mmq/mmv support
 #endif
 
-    const …
-    const …
+    const size_t nb11 = src1->nb[1];
+    const size_t nb1  =  dst->nb[1];
 
     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];

@@ -10304,15 +10304,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
 
     if (ggml_is_quantized(tensor->type)) {
         // initialize padding to 0 to avoid possible NaN values
-        int64_t row_low = 0;
-        int64_t row_high = ggml_nrows(tensor);
-        int64_t nrows_split = row_high - row_low;
-
-        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t original_size = ggml_nbytes(tensor);
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
         if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(…
+            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
         }
     }
 }

@@ -10415,12 +10411,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
 }
 
 GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
+    size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];
 
     if (ggml_is_quantized(tensor->type)) {
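As the commit title says, these two hunks stop using the split-buffer size calculation (ggml_nbytes_split over a row range) for the non-split CUDA buffer and take the plain ggml_nbytes of the tensor instead; the region between that size and the padded allocation size is then zeroed, per the in-code comment, to avoid possible NaN values. A host-side sketch of the same arithmetic follows; the byte counts 6000 and 6144 and the local buffer are invented stand-ins for the real tensor and device memory.

#include <stdio.h>
#include <string.h>

int main(void) {
    /* hypothetical sizes standing in for ggml_nbytes(tensor) and
       ggml_backend_buft_get_alloc_size(buffer->buft, tensor) */
    size_t original_size = 6000;
    size_t padded_size   = 6144;

    char data[6144];  /* stand-in for the tensor's device allocation */

    /* mirrors the cudaMemset in the patch: only the tail past the tensor's
       real bytes is cleared, so the padding never holds uninitialized data */
    memset(data + original_size, 0, padded_size - original_size);

    printf("cleared %zu padding bytes\n", padded_size - original_size);
    return 0;
}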