dou112 committed on
Commit
66b93b3
·
1 Parent(s): a33d74f

CANN: Optimize CANN buffer pool memory management (llama/12875)

Browse files

Multiple optional memory pools are provided for CANN: a VMM pool, a
priority-queue-based buffer pool, and a traditional buffer pool.
1. When the VMM pool is available on the device and the GGML_CANN_DISABLE_VMM_POOL
environment variable is not set, the VMM pool is selected by default.
2. Otherwise, if the GGML_CANN_ENABLE_BUF_PRIO_POOL environment variable is set,
the priority-queue-based buffer pool is used.
3. If neither condition is met, the default buffer pool is used.

ggml/src/ggml-cann/aclnn_ops.cpp CHANGED
@@ -1783,7 +1783,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1783
  src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
1784
  GGML_MAX_DIMS + 1);
1785
  aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
1786
- src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb,
1787
  GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
1788
  aclTensor* dequant_tensor = ggml_cann_create_tensor(
1789
  dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
 
1783
  src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
1784
  GGML_MAX_DIMS + 1);
1785
  aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
1786
+ src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
1787
  GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
1788
  aclTensor* dequant_tensor = ggml_cann_create_tensor(
1789
  dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
ggml/src/ggml-cann/ggml-cann.cpp CHANGED
@@ -29,6 +29,8 @@
29
  #include <cstdio>
30
  #include <cstring>
31
  #include <mutex>
 
 
32
 
33
  #include "ggml-impl.h"
34
  #include "ggml-backend-impl.h"
@@ -119,9 +121,10 @@ static ggml_cann_device_info ggml_cann_init() {
119
  prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
120
  prop.location.id = id;
121
  prop.reserve = 0;
122
- ACL_CHECK(aclrtMemGetAllocationGranularity(
123
  &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
124
- &info.devices[id].vmm_granularity));
 
125
 
126
  size_t free, total;
127
  ggml_backend_cann_get_device_memory(id, &free, &total);
@@ -148,11 +151,222 @@ const ggml_cann_device_info& ggml_cann_info() {
148
 
149
  //#define DEBUG_CANN_MALLOC
150
  /**
151
- * @brief A pool of CANN buffers(legacy).
152
  *
153
  * This class manages a pool of CANN buffers for a specific device.
154
  */
155
- struct ggml_cann_pool_leg : public ggml_cann_pool {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  /**
157
  * @brief The maximum number of buffers in the pool.
158
  */
@@ -163,12 +377,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
163
  */
164
  int device;
165
 
 
 
 
 
 
166
  /**
167
  * @brief Structure representing a CANN buffer.
168
  */
169
  struct ggml_cann_buffer {
170
  void* ptr = nullptr; ///< Pointer to the buffer memory.
171
  size_t size = 0; ///< Size of the buffer.
 
 
172
  };
173
 
174
  /**
@@ -186,17 +407,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
186
  *
187
  * @param device The device ID to associate with this buffer pool.
188
  */
189
- explicit ggml_cann_pool_leg(int device) : device(device) {}
 
 
190
 
191
  /**
192
  * @brief Destructor to free all buffers in the pool.
193
  */
194
- ~ggml_cann_pool_leg() {
195
  ggml_cann_set_device(device);
196
  for (int i = 0; i < MAX_BUFFERS; ++i) {
197
  ggml_cann_buffer& b = buffer_pool[i];
198
  if (b.ptr != nullptr) {
199
- ACL_CHECK(aclrtFree(b.ptr));
200
  pool_size -= b.size;
201
  }
202
  }
@@ -212,63 +435,93 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
212
  * @return A pointer to the allocated buffer.
213
  */
214
  void* alloc(size_t size, size_t* actual_size) override {
215
- const size_t alignment = 128;
216
  size = GGML_PAD(size, alignment);
217
  if (size == 0) {
218
  size = alignment;
219
  }
220
- #ifdef DEBUG_CANN_MALLOC
221
- int nnz = 0;
222
- size_t max_size = 0;
223
- #endif
224
- size_t best_diff = 1ull << 36;
225
- int ibest = -1;
226
- for (int i = 0; i < MAX_BUFFERS; ++i) {
227
  ggml_cann_buffer& b = buffer_pool[i];
228
- if (b.ptr != nullptr) {
 
 
 
 
 
 
 
 
 
 
 
 
229
  #ifdef DEBUG_CANN_MALLOC
230
- ++nnz;
231
- if (b.size > max_size) max_size = b.size;
 
 
 
 
 
 
 
232
  #endif
233
- if (b.size >= size) {
234
- size_t diff = b.size - size;
235
- if (diff < best_diff) {
236
- best_diff = diff;
237
- ibest = i;
238
- if (!best_diff) {
239
- void* ptr = b.ptr;
240
- *actual_size = b.size;
241
- b.ptr = nullptr;
242
- b.size = 0;
243
- return ptr;
244
- }
245
- }
246
  }
247
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  }
249
- if (ibest >= 0) {
250
- ggml_cann_buffer& b = buffer_pool[ibest];
251
- void* ptr = b.ptr;
252
- *actual_size = b.size;
253
- b.ptr = nullptr;
254
- b.size = 0;
255
  return ptr;
256
  }
257
- void* ptr;
258
- ggml_cann_set_device(device);
259
- ACL_CHECK(
260
- aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
261
- *actual_size = size;
262
- pool_size += size;
 
 
 
 
 
 
 
263
  #ifdef DEBUG_CANN_MALLOC
264
- GGML_LOG_INFO(
265
- "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
266
- "requested %u MB\n",
267
- __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
268
- (uint32_t)(pool_size / 1024 / 1024),
269
- (uint32_t)(size / 1024 / 1024));
 
270
  #endif
271
- return ptr;
 
 
 
272
  }
273
 
274
  /**
@@ -280,16 +533,21 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
280
  void free(void* ptr, size_t size) override {
281
  for (int i = 0; i < MAX_BUFFERS; ++i) {
282
  ggml_cann_buffer& b = buffer_pool[i];
283
- if (b.ptr == nullptr) {
284
- b.ptr = ptr;
285
- b.size = size;
286
- return;
287
  }
 
 
 
 
 
 
 
 
 
 
288
  }
289
- // memory should always buffered. these memory may still needed by
290
- // tasks in stream.
291
- // TODO, fix me.
292
- GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
293
  }
294
  };
295
 
@@ -347,8 +605,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
347
  * @param device The device ID to associate with this buffer pool.
348
  */
349
  explicit ggml_cann_pool_vmm(int device)
350
- : device(device),
351
- granularity(ggml_cann_info().devices[device].vmm_granularity) {
352
  auto dev = ggml_cann_info().devices[device];
353
  granularity = dev.vmm_granularity;
354
  max_size = dev.total_vram;
@@ -471,7 +728,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
471
  */
472
  std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
473
  int device) {
474
- return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
 
 
 
 
 
 
 
 
 
 
 
475
  }
476
 
477
  // cann buffer
@@ -1020,8 +1288,11 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1020
 
1021
  ggml_cann_set_device(buft_ctx->device);
1022
 
1023
- size = std::max(size, (size_t)1);
1024
-
 
 
 
1025
  void* dev_ptr;
1026
  aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
1027
  if (err != ACL_SUCCESS) {
 
29
  #include <cstdio>
30
  #include <cstring>
31
  #include <mutex>
32
+ #include <queue>
33
+ #include <chrono>
34
 
35
  #include "ggml-impl.h"
36
  #include "ggml-backend-impl.h"
 
121
  prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
122
  prop.location.id = id;
123
  prop.reserve = 0;
124
+ err = aclrtMemGetAllocationGranularity(
125
  &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
126
+ &info.devices[id].vmm_granularity);
127
+ info.devices[id].vmm = err == ACL_SUCCESS;
128
 
129
  size_t free, total;
130
  ggml_backend_cann_get_device_memory(id, &free, &total);
 
151
 
152
  //#define DEBUG_CANN_MALLOC
153
  /**
154
+ * @brief A pool of CANN buffers(priority segment buffer).
155
  *
156
  * This class manages a pool of CANN buffers for a specific device.
157
  */
158
+ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
159
+ /**
160
+ * @brief The maximum reuse margin for a buffer.
161
+ */
162
+ static const size_t max_reuse_margin = 1ull << 22; // 4MB
163
+
164
+ /**
165
+ * @brief The minimum free margin for a buffer.
166
+ */
167
+ static const size_t min_free_margin = 1ull << 20; // 1MB
168
+
169
+ /**
170
+ * @brief The alignment for buffer allocation.
171
+ */
172
+ static const size_t alignment = 128;
173
+
174
+ /**
175
+ * @brief The device ID associated with this buffer pool.
176
+ */
177
+ int device;
178
+
179
+ /**
180
+ * @brief Whether to disable clean during buffer allocation.
181
+ */
182
+ bool disable_clean = false;
183
+
184
+ /**
185
+ * @brief Structure representing a CANN buffer.
186
+ */
187
+ struct ggml_cann_buffer {
188
+ void* ptr = nullptr; ///< Pointer to the buffer.
189
+ size_t size = 0; ///< Size of the buffer.
190
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
191
+
192
+ bool operator>(const ggml_cann_buffer& other) const {
193
+ return size > other.size;
194
+ }
195
+ };
196
+
197
+ /**
198
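+ * @brief Map of every buffer owned by the pool (pointer -> size), plus a min-heap (smallest size on top) of the buffers currently free for reuse.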
199
+ */
200
+ std::unordered_map<void*, size_t> buffer_pool;
201
+ std::priority_queue<ggml_cann_buffer,
202
+ std::vector<ggml_cann_buffer>,
203
+ std::greater<>> free_buffers ;
204
+
205
+ /**
206
+ * @brief Total size of all buffers in the pool.
207
+ */
208
+ size_t pool_size = 0;
209
+
210
+ /**
211
+ * @brief Constructor to initialize the buffer pool for a specific device.
212
+ *
213
+ * @param device The device ID to associate with this buffer pool.
214
+ */
215
+ explicit ggml_cann_pool_buf_prio(int device) : device(device) {
216
+ disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
217
+ }
218
+
219
+ /**
220
+ * @brief Destructor to free all buffers in the pool.
221
+ */
222
+ ~ggml_cann_pool_buf_prio() {
223
+ ggml_cann_set_device(device);
224
+ for (auto& [b_ptr, b_size] : buffer_pool) {
225
+ aclrtFree(b_ptr);
226
+ pool_size -= b_size;
227
+ }
228
+ buffer_pool.clear();
229
+ GGML_ASSERT(pool_size == 0);
230
+ }
231
+
232
+ /**
233
+ * @brief Allocate a buffer of the given size.
234
+ *
235
+ * @param size The size of the buffer to allocate.
236
+ * @param actual_size A pointer to a variable to receive the actual size of
237
+ * the allocated buffer.
238
+ * @return A pointer to the allocated buffer.
239
+ */
240
+ void* alloc(size_t size, size_t* actual_size) override {
241
+ size = GGML_PAD(size, alignment);
242
+ if (size == 0) {
243
+ size = alignment;
244
+ }
245
+
246
+ void* ptr = nullptr;
247
+ auto now = std::chrono::steady_clock::now();
248
+
249
+ std::vector<ggml_cann_buffer> free_buffers_rest;
250
+ free_buffers_rest.reserve(free_buffers.size());
251
+ while (!free_buffers.empty()) {
252
+ auto b = free_buffers.top();
253
+ free_buffers.pop();
254
+
255
+ if (b.size >= size) {
256
+ // reuse the buffer if the size is enough
257
+ const size_t margin = b.size - size;
258
+ if (margin <= max_reuse_margin) {
259
+ *actual_size = b.size;
260
+ ptr = b.ptr;
261
+ #ifdef DEBUG_CANN_MALLOC
262
+ GGML_LOG_INFO(
263
+ "cann pool[%d]: reused %p, "
264
+ "pool_size = %5u MB, "
265
+ "size = %5u MB, "
266
+ "margin = %5u MB\n",
267
+ device, b.ptr,
268
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
269
+ (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
270
+ (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
271
+ #endif
272
+ break;
273
+ }
274
+ }
275
+
276
+ bool should_clean = !disable_clean &&
277
+ b.size > min_free_margin &&
278
+ std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
279
+ if (should_clean) {
280
+ // free the buffer if the size is needed to be freed
281
+ ACL_CHECK(aclrtFree(b.ptr));
282
+ pool_size -= b.size;
283
+ buffer_pool.erase(b.ptr);
284
+ #ifdef DEBUG_CANN_MALLOC
285
+ GGML_LOG_INFO(
286
+ "cann pool[%d]: clean %p, "
287
+ "pool_size = %5u MB, "
288
+ "size = %5u MB\n",
289
+ device, b.ptr,
290
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
291
+ (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
292
+ #endif
293
+ continue;
294
+ }
295
+ free_buffers_rest.push_back(b);
296
+ }
297
+ for (ggml_cann_buffer &b : free_buffers_rest) {
298
+ free_buffers.push(std::move(b));
299
+ }
300
+
301
+ #ifdef DEBUG_CANN_MALLOC
302
+ GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
303
+ #endif
304
+ if (ptr != nullptr) {
305
+ return ptr;
306
+ }
307
+
308
+ // allocate a new buffer if no buffer can be reused
309
+ ggml_cann_set_device(device);
310
+ ACL_CHECK(aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
311
+ *actual_size = size;
312
+ pool_size += size;
313
+ #ifdef DEBUG_CANN_MALLOC
314
+ GGML_LOG_INFO(
315
+ "cann pool[%d]: allocate %p, "
316
+ "pool_size = %5u MB, "
317
+ "size = %5u MB\n",
318
+ device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
319
+ (uint32_t)(GGML_PAD(size, 1048576) / 1048576));
320
+ #endif
321
+ buffer_pool.emplace(ptr, size);
322
+ return ptr;
323
+ }
324
+
325
+ /**
326
+ * @brief Free a buffer and return it to the pool.
327
+ *
328
+ * @param ptr Pointer to the buffer to free.
329
+ * @param size Size of the buffer to free.
330
+ */
331
+ void free(void* ptr, size_t size) override {
332
+ auto it = buffer_pool.find(ptr);
333
+ if (it == buffer_pool.end()) {
334
+ GGML_ABORT("cann pool[%d]: buffer %p not found in pool\n", device, ptr);
335
+ }
336
+
337
+ auto now = std::chrono::steady_clock::now();
338
+ free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
339
+ #ifdef DEBUG_CANN_MALLOC
340
+ GGML_LOG_INFO(
341
+ "cann pool[%d]: return %p, "
342
+ "pool_size = %5u MB\n",
343
+ device, ptr,
344
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
345
+ #endif
346
+ }
347
+ };
348
+
349
+ /**
350
+ * @brief A pool of CANN buffers(segment buffer).
351
+ *
352
+ * This class manages a pool of CANN buffers for a specific device.
353
+ */
354
+ struct ggml_cann_pool_buf : public ggml_cann_pool {
355
+ /**
356
+ * @brief The maximum reuse margin for a buffer.
357
+ */
358
+ static const size_t max_reuse_margin = 1ull << 22; // 4MB
359
+
360
+ /**
361
+ * @brief The minimum free margin for a buffer.
362
+ */
363
+ static const size_t min_free_margin = 1ull << 20; // 1MB
364
+
365
+ /**
366
+ * @brief The alignment for buffer allocation.
367
+ */
368
+ static const size_t alignment = 128;
369
+
370
  /**
371
  * @brief The maximum number of buffers in the pool.
372
  */
 
377
  */
378
  int device;
379
 
380
+ /**
381
+ * @brief Whether to disable clean during buffer allocation.
382
+ */
383
+ bool disable_clean = false;
384
+
385
  /**
386
  * @brief Structure representing a CANN buffer.
387
  */
388
  struct ggml_cann_buffer {
389
  void* ptr = nullptr; ///< Pointer to the buffer memory.
390
  size_t size = 0; ///< Size of the buffer.
391
+ bool used = false; ///< Whether the buffer is currently in use.
392
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
393
  };
394
 
395
  /**
 
407
  *
408
  * @param device The device ID to associate with this buffer pool.
409
  */
410
+ explicit ggml_cann_pool_buf(int device) : device(device) {
411
+ disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
412
+ }
413
 
414
  /**
415
  * @brief Destructor to free all buffers in the pool.
416
  */
417
+ ~ggml_cann_pool_buf() {
418
  ggml_cann_set_device(device);
419
  for (int i = 0; i < MAX_BUFFERS; ++i) {
420
  ggml_cann_buffer& b = buffer_pool[i];
421
  if (b.ptr != nullptr) {
422
+ aclrtFree(b.ptr);
423
  pool_size -= b.size;
424
  }
425
  }
 
435
  * @return A pointer to the allocated buffer.
436
  */
437
  void* alloc(size_t size, size_t* actual_size) override {
 
438
  size = GGML_PAD(size, alignment);
439
  if (size == 0) {
440
  size = alignment;
441
  }
442
+
443
+ void* ptr = nullptr;
444
+ auto now = std::chrono::steady_clock::now();
445
+
446
+ int i = 0;
447
+ for (; i < MAX_BUFFERS; ++i) {
 
448
  ggml_cann_buffer& b = buffer_pool[i];
449
+ if (b.ptr == nullptr) {
450
+ break;
451
+ }
452
+ if (b.used) {
453
+ continue;
454
+ }
455
+ if (b.size >= size) {
456
+ // reuse the buffer if the size is enough
457
+ const size_t margin = b.size - size;
458
+ if (margin <= max_reuse_margin) {
459
+ *actual_size = b.size;
460
+ b.used = true;
461
+ ptr = b.ptr;
462
  #ifdef DEBUG_CANN_MALLOC
463
+ GGML_LOG_INFO(
464
+ "cann pool[%d]: reused %p, "
465
+ "pool_size = %5u MB, "
466
+ "size = %5u MB, "
467
+ "margin = %5u MB\n",
468
+ device, b.ptr,
469
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
470
+ (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
471
+ (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
472
  #endif
473
+ break;
 
 
 
 
 
 
 
 
 
 
 
 
474
  }
475
  }
476
+
477
+ bool should_clean = !disable_clean &&
478
+ b.size > min_free_margin &&
479
+ std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
480
+ if (should_clean) {
481
+ // free the buffer if the size is needed to be freed
482
+ ACL_CHECK(aclrtFree(b.ptr));
483
+ pool_size -= b.size;
484
+ #ifdef DEBUG_CANN_MALLOC
485
+ GGML_LOG_INFO(
486
+ "cann pool[%d]: clean %p, "
487
+ "pool_size = %5u MB, "
488
+ "size = %5u MB\n",
489
+ device, b.ptr,
490
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
491
+ (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
492
+ #endif
493
+ b.ptr = nullptr;
494
+ }
495
  }
496
+ if (ptr != nullptr) {
 
 
 
 
 
497
  return ptr;
498
  }
499
+
500
+ if (i < MAX_BUFFERS) {
501
+ // allocate a new buffer if no buffer can be reused
502
+ ggml_cann_buffer& b = buffer_pool[i];
503
+ ggml_cann_set_device(device);
504
+ ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
505
+ pool_size += size;
506
+ *actual_size = size;
507
+ b.size = size;
508
+ b.used = true;
509
+ if (i >= MAX_BUFFERS - 8) {
510
+ GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
511
+ }
512
  #ifdef DEBUG_CANN_MALLOC
513
+ GGML_LOG_INFO(
514
+ "cann pool[%d]: allocate %p, "
515
+ "pool_size = %5u MB, "
516
+ "size = %5u MB\n",
517
+ device, b.ptr,
518
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
519
+ (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
520
  #endif
521
+ return b.ptr;
522
+ }
523
+
524
+ GGML_ABORT("cann pool[%d]: slots full\n", device);
525
  }
526
 
527
  /**
 
533
  void free(void* ptr, size_t size) override {
534
  for (int i = 0; i < MAX_BUFFERS; ++i) {
535
  ggml_cann_buffer& b = buffer_pool[i];
536
+ if (b.ptr != ptr) {
537
+ continue;
 
 
538
  }
539
+ b.used = false;
540
+ b.last_used = std::chrono::steady_clock::now();
541
+ #ifdef DEBUG_CANN_MALLOC
542
+ GGML_LOG_INFO(
543
+ "cann pool[%d]: return %p, "
544
+ "pool_size = %5u MB\n",
545
+ device, b.ptr,
546
+ (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
547
+ #endif
548
+ return;
549
  }
550
+ GGML_ABORT("cann pool[%d]: slots full\n", device);
 
 
 
551
  }
552
  };
553
 
 
605
  * @param device The device ID to associate with this buffer pool.
606
  */
607
  explicit ggml_cann_pool_vmm(int device)
608
+ : device(device) {
 
609
  auto dev = ggml_cann_info().devices[device];
610
  granularity = dev.vmm_granularity;
611
  max_size = dev.total_vram;
 
728
  */
729
  std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
730
  int device) {
731
+ bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
732
+ if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
733
+ GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
734
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
735
+ }
736
+ bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
737
+ if (enable_buf_prio) {
738
+ GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
739
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
740
+ }
741
+ GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
742
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
743
  }
744
 
745
  // cann buffer
 
1288
 
1289
  ggml_cann_set_device(buft_ctx->device);
1290
 
1291
+ const size_t alignment = 128;
1292
+ size = GGML_PAD(size, alignment);
1293
+ if (size == 0) {
1294
+ size = alignment;
1295
+ }
1296
  void* dev_ptr;
1297
  aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
1298
  if (err != ACL_SUCCESS) {